# DT Regressor.py

# -*- coding: utf-8 -*-
"""Task2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Gmr3HV-_31TMX_NlPYZh4XeXqvT91u8E
"""

# Colab-only step: open the browser upload dialog. The result is indexed by
# file name below and yields the raw bytes of the chosen file.
from google.colab import files
import io
import pandas as pd

uploaded = files.upload()

# Wrap the uploaded bytes in an in-memory buffer so pandas can parse the CSV
# without writing it to disk first.
csv_bytes = uploaded['Concrete_Data.xls - Sheet1.csv']
data = pd.read_csv(io.BytesIO(csv_bytes))

"""1 Identify the features and target from the data.

Features:
- Cement (component 1)(kg in a m^3 mixture)
- Blast Furnace Slag (component 2)(kg in a m^3 mixture)
- Fly Ash (component 3)(kg in a m^3 mixture)
- Water  (component 4)(kg in a m^3 mixture)
- Superplasticizer (component 5)(kg in a m^3 mixture)
- Coarse Aggregate  (component 6)(kg in a m^3 mixture)
- Fine Aggregate (component 7)(kg in a m^3 mixture)
- Age (day)

Target: Concrete compressive strength (MPa, megapascals)

2 Split dataset into training and test set using the following formula:
Let your roll number be B20XX207, and the last three digits of your roll number be S. If
S is odd split ratio is 70:30, and if it is even then, the split ratio is 80:20. In the above
example, S turns out to be 207, which is odd; hence split the data in 70:30.

4 Seed=2021

Roll number B20ME010: S = 010 is even, so the data is split 80:20.
"""

from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 20% of them for testing (80:20 split),
# seeded with 2021 so the partition is reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=2021,
)

# The last column is the target; every column before it is a feature.
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

"""6 Train a decision tree regressor and report model accuracy.

3 Use MSE and MAE

4 Seed=2021, 5 set node selection strategy as ‘best’.
"""

from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Decision tree regressor whose splits minimise the squared error.
# "squared_error" is the current spelling of the criterion formerly called
# "mse": the old alias was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing "mse" raises an error on current releases.
# NOTE: despite its name, `classifier` holds a regressor; the name is kept
# because the tree-visualisation code further down refers to it.
classifier = DecisionTreeRegressor(
    criterion="squared_error",
    random_state=2021,
    splitter="best",
    max_depth=12,
)  # each parameter is explained in the report
classifier.fit(X_train, y_train)

# Evaluate on the held-out test set: MAE first, then MSE.
y_pred = classifier.predict(X_test)
print(mean_absolute_error(y_test, y_pred))  # why accuracy can't be calculated here is also covered in the report
print(mean_squared_error(y_test, y_pred))

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Second decision tree regressor, this time with splits that minimise the
# absolute error. "absolute_error" is the current spelling of the criterion
# formerly called "mae": the old alias was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing "mae" raises an error on current releases.
classifier1 = DecisionTreeRegressor(
    criterion="absolute_error",
    random_state=2021,
    splitter="best",
    max_depth=12,
)
classifier1.fit(X_train, y_train)

# Evaluate on the held-out test set: MAE first, then MSE.
# `y_pred` is deliberately rebound to this model's predictions.
y_pred = classifier1.predict(X_test)
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

"""7 Provide a graphical visualization of the tree."""

import graphviz
from sklearn import tree

# Export the first fitted tree (`classifier`) as DOT source text.
# out_file=None (the default) makes export_graphviz return the DOT string
# instead of writing it to a file.
dot_tree = tree.export_graphviz(
    classifier,
    out_file=None,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_tree, format="png")
graph  # bare expression: rendered as the cell output when run in a notebook

import graphviz
from sklearn import tree

# Export the second fitted tree (`classifier1`) as DOT source text.
# out_file=None (the default) makes export_graphviz return the DOT string
# instead of writing it to a file.
dot_tree = tree.export_graphviz(
    classifier1,
    out_file=None,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_tree, format="png")
graph  # bare expression: rendered as the cell output when run in a notebook

"""8 Prepare confusion matrix and classification report."""

from sklearn import metrics
from sklearn.metrics import classification_report

#print(classification_report(y_test, y_pred))
#cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
#print(cnf_matrix)
# A confusion matrix and classification report cannot be produced for this
# dataset (the model is a regressor); the reason is explained in the report.