# DT Classifier for Drugs dataset.py

# -*- coding: utf-8 -*-
"""task1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Bu5XKQTtlxj01cp2M-frr-icJjMy7Bix
"""

import io

import pandas as pd
from google.colab import files

# Open the Colab upload dialog; the returned mapping is indexed below by file name.
uploaded = files.upload()

# Wrap the uploaded 'drugs.csv' payload in an in-memory buffer and parse it as CSV.
csv_bytes = uploaded['drugs.csv']
data = pd.read_csv(io.BytesIO(csv_bytes))


"""Task 1: Decision tree classifier


1 Identify the features and target from the data.

"""


"""Features-Age, Sex, BP, Cholesterol, Na_to_K
Label-Drug

2 Look for missing values in the data, if any, and address them accordingly.
"""

# Count the missing values in every column. A count of 0 for each column means
# there is nothing to impute or drop. The original notebook reported no missing
# values for this dataset; printing the per-column counts makes that claim
# verifiable, whereas a bare `data.isnull()` only yields a large boolean frame
# (and shows nothing at all when the file is run as a script).
missing_per_column = data.isnull().sum()
print(missing_per_column)

"""3 Find out the ordinal/nominal/categorical data, if any, and convert them into numerical equivalent.

Nominal: Sex, Drug. Ordinal: BP, Cholesterol. All four (Sex, BP, Cholesterol, Drug) are categorical.
"""

from sklearn.preprocessing import OrdinalEncoder  # imported by the notebook; the encoding below uses pandas category codes instead

# Replace every categorical column (features and the Drug label) with the integer
# codes pandas assigns to its categories.
# NOTE(review): presumably pandas orders the categories lexically, so the codes for
# the ordinal columns (BP, Cholesterol) may not follow their natural low-to-high
# order -- confirm whether that matters for this task.
for column_name in ('Sex', 'BP', 'Cholesterol', 'Drug'):
    data[column_name] = data[column_name].astype('category').cat.codes

data

"""4,5 Split the dataset into 70:30, 80:20, and 90:10. 5 For reproducibility, set seed = 55 throughout."""

from sklearn.model_selection import train_test_split


def _features_and_target(frame):
    """Return (every column except the last, the last column) of *frame*."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


# 70:30 split (30% held out for testing), seed 55.
train_data, test_data = train_test_split(
    data, test_size=0.3, shuffle=True, random_state=55
)
X_train, y_train = _features_and_target(train_data)
X_test, y_test = _features_and_target(test_data)


# 80:20 split (20% held out for testing) on a separate copy of the data, seed 55.
data1 = data.copy()
train_data1, test_data1 = train_test_split(
    data1, test_size=0.2, shuffle=True, random_state=55
)
X_train_1, y_train_1 = _features_and_target(train_data1)
X_test_1, y_test_1 = _features_and_target(test_data1)


# 90:10 split (10% held out for testing) on a separate copy of the data, seed 55.
data2 = data.copy()
train_data2, test_data2 = train_test_split(
    data2, test_size=0.1, shuffle=True, random_state=55
)
X_train_2, y_train_2 = _features_and_target(train_data2)
X_test_2, y_test_2 = _features_and_target(test_data2)

"""6 Use Entropy information gain for 80:20 split and Gini-index for the rest. 7 Train a decision tree classifier and report model accuracy."""

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameters shared by all three trees; only the split criterion differs.
tree_options = {"random_state": 55, "max_depth": 3}

# 80:20 split -> entropy (information gain) criterion.
classifier1 = DecisionTreeClassifier(criterion="entropy", **tree_options).fit(
    X_train_1, y_train_1
)
y_pred_1 = classifier1.predict(X_test_1)
print("Accuracy:", metrics.accuracy_score(y_test_1, y_pred_1))

# 70:30 split -> Gini criterion.
classifier = DecisionTreeClassifier(criterion="gini", **tree_options).fit(
    X_train, y_train
)
y_pred = classifier.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# 90:10 split -> Gini criterion.
classifier2 = DecisionTreeClassifier(criterion="gini", **tree_options).fit(
    X_train_2, y_train_2
)
y_pred_2 = classifier2.predict(X_test_2)
print("Accuracy:", metrics.accuracy_score(y_test_2, y_pred_2))

"""8 Prepare confusion matrix and classification report."""

from sklearn import metrics
from sklearn.metrics import classification_report

# Print the classification report followed by the confusion matrix for each
# split, in the order 70:30, 80:20, 90:10.
for true_labels, predicted_labels in (
    (y_test, y_pred),
    (y_test_1, y_pred_1),
    (y_test_2, y_pred_2),
):
    print(classification_report(true_labels, predicted_labels))
    cnf_matrix = metrics.confusion_matrix(true_labels, predicted_labels)
    print(cnf_matrix)

"""9 Provide a graphical visualization of the tree."""

import graphviz
from sklearn import tree

# Each tree is exported to DOT and wrapped in a graphviz Source object. The bare
# `graph` expressions are what a notebook cell displays; run as a plain script
# they have no visible effect.
# Passing feature_names labels every split with its column name instead of the
# default positional `X[i]` placeholder, which makes the trees readable.

# Tree trained on the 70:30 split (Gini).
dot_tree = tree.export_graphviz(
    classifier,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_tree, format="png")
graph

# Tree trained on the 80:20 split (entropy).
dot_tree = tree.export_graphviz(
    classifier1,
    feature_names=list(X_train_1.columns),
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_tree, format="png")
graph

# Tree trained on the 90:10 split (Gini).
dot_tree = tree.export_graphviz(
    classifier2,
    feature_names=list(X_train_2.columns),
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_tree, format="png")
graph

"""10 Comment on overfitting."""

from sklearn.metrics import accuracy_score

# Overfitting check: for every split, compare the accuracy of the model on the
# data it was trained on ("Model Accuracy") with its accuracy on the held-out
# test data ("Test Accuracy"). A training accuracy that is far above the test
# accuracy (or exactly 1) suggests the tree has memorised the training set.

# 70:30 split -- `classifier` (Gini).
y_train_pred = classifier.predict(X_train)
print('Model Accuracy', accuracy_score(y_train, y_train_pred))
# Original observation for this split: the gap between training and test accuracy
# is small and the training accuracy is not exactly 1, so the model has not overfitted.
print('Test Accuracy', accuracy_score(y_test, y_pred))

# 80:20 split -- `classifier1` (entropy).
# Fixed: the training accuracy must come from the model trained on this split;
# the original evaluated the 70:30 model (`classifier`) on X_train_1 instead.
y_train_pred_1 = classifier1.predict(X_train_1)
print('Model Accuracy', accuracy_score(y_train_1, y_train_pred_1))
# NOTE(review): the notebook's earlier remark for this split (training accuracy
# somewhat higher than test accuracy, i.e. mild overfitting) was based on the
# wrong model's predictions -- re-check it against the numbers printed here.
print('Test Accuracy', accuracy_score(y_test_1, y_pred_1))

# 90:10 split -- `classifier2` (Gini).
# Fixed: same correction as above, `classifier2` instead of `classifier`.
y_train_pred_2 = classifier2.predict(X_train_2)
print('Model Accuracy', accuracy_score(y_train_2, y_train_pred_2))
# NOTE(review): the notebook's earlier remark for this split (small gap, training
# accuracy below 1, hence no overfitting) was likewise based on the wrong model's
# predictions -- re-check it against the numbers printed here.
print('Test Accuracy', accuracy_score(y_test_2, y_pred_2))