-
Notifications
You must be signed in to change notification settings - Fork 0
/
DT Classifier for Drugs dataset.py
172 lines (116 loc) · 5.68 KB
/
DT Classifier for Drugs dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
"""task1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Bu5XKQTtlxj01cp2M-frr-icJjMy7Bix
"""
from google.colab import files
import pandas as pd
import io

# Ask the Colab user to upload the CSV; the upload result is indexed by file name.
uploaded = files.upload()
raw_csv = uploaded['drugs.csv']
# Wrap the uploaded payload in an in-memory binary stream so pandas can parse it.
data = pd.read_csv(io.BytesIO(raw_csv))
"""Task 1: Decision tree classifier
1 Identify the features and target from the data.
"""
"""Features-Age, Sex, BP, Cholesterol, Na_to_K
Label-Drug
2 Look for missing values in the data, if any, and address them accordingly.
"""
# Count missing values per column. A bare `data.isnull()` only builds a boolean
# frame (and shows nothing when run as a script); the per-column totals make the
# check explicit: every count being 0 means there is nothing to impute or drop.
print(data.isnull().sum())
"""3 Find out the ordinal/nominal/categorical data, if any, and convert them into numerical equivalent.
Sex-Nominal, BP-Ordinal, Cholesterol-Ordinal, Drug-Nominal, Categorical- Sex, BP, Cholesterol, Drug
"""
from sklearn.preprocessing import OrdinalEncoder # to encode male and female in 0s and 1s
# Replace each categorical column (features and the Drug target) with its
# integer category code.
# NOTE(review): the codes follow pandas' category ordering, which may not match
# the intended low-to-high order for the ordinal columns (BP, Cholesterol) - confirm.
for column in ('Sex', 'BP', 'Cholesterol', 'Drug'):
    data[column] = data[column].astype('category').cat.codes
data
"""4,5 Split the dataset into 70:30, 80:20, and 90:10. 5 For reproducibility, set seed = 55 throughout."""
from sklearn.model_selection import train_test_split
# In every split below the last column is taken as the target and all
# preceding columns as features; the seed is fixed at 55 for reproducibility.

# 70:30 split.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=55, shuffle=True)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

# 80:20 split, taken from a separate copy of the frame.
data1 = data.copy()
train_data1, test_data1 = train_test_split(data1, test_size=0.2, random_state=55, shuffle=True)
X_train_1, y_train_1 = train_data1.iloc[:, :-1], train_data1.iloc[:, -1]
X_test_1, y_test_1 = test_data1.iloc[:, :-1], test_data1.iloc[:, -1]

# 90:10 split, taken from a separate copy of the frame.
data2 = data.copy()
train_data2, test_data2 = train_test_split(data2, test_size=0.1, random_state=55, shuffle=True)
X_train_2, y_train_2 = train_data2.iloc[:, :-1], train_data2.iloc[:, -1]
X_test_2, y_test_2 = test_data2.iloc[:, :-1], test_data2.iloc[:, -1]
"""6 Use Entropy information gain for 80:20 split and Gini-index for the rest. 7 Train a decision tree classifier and report model accuracy."""
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
# Entropy-criterion tree (depth capped at 3, seed 55) fitted on the 80:20 split.
classifier1 = DecisionTreeClassifier(
    max_depth=3,
    criterion="entropy",
    random_state=55,
).fit(X_train_1, y_train_1)
# Score the fitted tree on the held-out 20%.
y_pred_1 = classifier1.predict(X_test_1)
print("Accuracy:", metrics.accuracy_score(y_test_1, y_pred_1))
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
# Gini-criterion tree (depth capped at 3, seed 55) fitted on the 70:30 split.
classifier = DecisionTreeClassifier(
    max_depth=3,
    criterion="gini",
    random_state=55,
).fit(X_train, y_train)
# Score the fitted tree on the held-out 30%.
y_pred = classifier.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
# Gini-criterion tree (depth capped at 3, seed 55) fitted on the 90:10 split.
classifier2 = DecisionTreeClassifier(
    max_depth=3,
    criterion="gini",
    random_state=55,
).fit(X_train_2, y_train_2)
# Score the fitted tree on the held-out 10%.
y_pred_2 = classifier2.predict(X_test_2)
print("Accuracy:", metrics.accuracy_score(y_test_2, y_pred_2))
"""8 Prepare confusion matrix and classification report."""
from sklearn import metrics
from sklearn.metrics import classification_report
# Classification report and confusion matrix for the 70:30 (gini) model.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cnf_matrix)
from sklearn import metrics
from sklearn.metrics import classification_report
# Classification report and confusion matrix for the 80:20 (entropy) model.
report_1 = classification_report(y_true=y_test_1, y_pred=y_pred_1)
print(report_1)
cnf_matrix = metrics.confusion_matrix(y_true=y_test_1, y_pred=y_pred_1)
print(cnf_matrix)
from sklearn import metrics
from sklearn.metrics import classification_report
# Classification report and confusion matrix for the 90:10 (gini) model.
report_2 = classification_report(y_true=y_test_2, y_pred=y_pred_2)
print(report_2)
cnf_matrix = metrics.confusion_matrix(y_true=y_test_2, y_pred=y_pred_2)
print(cnf_matrix)
"""9 Provide a graphical visualization of the tree."""
import graphviz
from sklearn import tree
# Export the 70:30 (gini) tree to DOT text and wrap it as a Graphviz source
# object configured for PNG output.
dot_tree = tree.export_graphviz(
    classifier,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_tree, format="png")
# Bare expression kept from the notebook export; it has no effect in a plain script.
graph
import graphviz
from sklearn import tree
# Export the 80:20 (entropy) tree to DOT text and wrap it as a Graphviz source
# object configured for PNG output.
dot_tree = tree.export_graphviz(
    classifier1,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_tree, format="png")
# Bare expression kept from the notebook export; it has no effect in a plain script.
graph
import graphviz
from sklearn import tree
# Export the 90:10 (gini) tree to DOT text and wrap it as a Graphviz source
# object configured for PNG output.
dot_tree = tree.export_graphviz(
    classifier2,
    rounded=True,
    filled=True,
)
graph = graphviz.Source(dot_tree, format="png")
# Bare expression kept from the notebook export; it has no effect in a plain script.
graph
"""10 Comment on overfitting."""
from sklearn.metrics import accuracy_score

# Overfitting check for the 70:30 (gini) model: compare accuracy on the data
# it was fitted on with accuracy on the held-out test set.
y_train_pred = classifier.predict(X_train)
print('Model Accuracy', accuracy_score(y_train, y_train_pred))
# Author's reading: the training and test accuracies are close and the training
# accuracy is not exactly 1, so the model does not appear to have overfitted.
print('Test Accuracy', accuracy_score(y_test, y_pred))
# Overfitting check for the 80:20 (entropy) model. The training predictions
# must come from classifier1, the tree fitted on X_train_1; using `classifier`
# (the 70:30 gini tree) here would report a different model's accuracy and make
# the train-vs-test comparison meaningless.
y_train_pred_1 = classifier1.predict(X_train_1)
print('Model Accuracy', accuracy_score(y_train_1, y_train_pred_1))
# Author's original reading: training accuracy is somewhat higher than test
# accuracy, suggesting mild overfitting, though test accuracy is still high.
# NOTE(review): that reading was based on predictions from the wrong model;
# re-run and re-check the conclusion against the corrected numbers.
print('Test Accuracy', accuracy_score(y_test_1, y_pred_1))
# Overfitting check for the 90:10 (gini) model. The training predictions must
# come from classifier2, the tree fitted on X_train_2; using `classifier`
# (the 70:30 tree) here would report a different model's accuracy and make the
# train-vs-test comparison meaningless.
y_train_pred_2 = classifier2.predict(X_train_2)
print('Model Accuracy', accuracy_score(y_train_2, y_train_pred_2))
# Author's original reading: training and test accuracies are close and the
# training accuracy is not exactly 1, so the model has not overfitted.
# NOTE(review): that reading was based on predictions from the wrong model;
# re-run and re-check the conclusion against the corrected numbers.
print('Test Accuracy', accuracy_score(y_test_2, y_pred_2))