# hackathon.py

# -*- coding: utf-8 -*-
"""Hackathon.ipynb

Automatically generated by Colaboratory.

"""

#@title Set up Boto credentials to pull data from S3
# NOTE: the statements in these cells were indented by one space at top level,
# which makes Python reject the whole file with "unexpected indent"; they are
# now flush-left with 4-space nested bodies.
import boto3
import botocore

BUCKET_NAME = 'amazing-bucket-am-1'  # replace with your bucket name

# Enter authentication credentials.
# Credentials for your AWS account can be found in the IAM Console. You can
# create or use an existing user. Go to manage access keys and generate a new
# set of keys. Do not commit real keys to source control.
s3 = boto3.resource('s3', aws_access_key_id='ENTER YOUR ACCESS KEY', aws_secret_access_key='ENTER YOUR SECRET KEY')

#@title Download "training.csv" from S3
KEY = 'hackathon/training.csv'  # replace with your object key

try:
    # Download the training set from S3 into the working directory as `training.csv`.
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'training.csv')
except botocore.exceptions.ClientError as e:
    # A 404 only means the object is missing; anything else is re-raised.
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise

#@title Download "test.csv" from S3
KEY = 'hackathon/test.csv'  # replace with your object key

try:
    # Download the test set from S3 into the working directory as `test.csv`.
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'test.csv')
except botocore.exceptions.ClientError as e:
    # A 404 only means the object is missing; anything else is re-raised.
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise

#@title or Upload the provided files "Training.csv" and then "Test.csv".
from google.colab import files

# Two upload prompts are issued back to back; per the original cell titles the
# first is meant for the training CSV and the second for the test CSV.
for _ in range(2):
    files.upload()

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
# %matplotlib inline
import missingno as msno

#@title Loads the data.

from IPython import display  # display and printing helpers
import pandas  # Pandas (http://pandas.pydata.org) is a fast, powerful and popular open source data analysis library.
import numpy as np
from IPython.display import HTML

# Read the training CSV from the working directory for the exploratory cells.
# No `na_values` is passed here, so only pandas' default missing-value markers
# are converted to NaN.
training_data = pandas.read_csv("training.csv", encoding='utf-8')

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

sns.set(style='white', context='notebook', palette='deep')

# Relabel the 15 columns with short snake_case names used by the plots below.
training_data.columns = [
    'age', 'workClass', 'fnlwgt', 'education', 'education_num',
    'maritial_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'over50k',
]

# Per-column count of values pandas parsed as missing.
training_data.isnull().sum()

# Peek at data
training_data.head(4)

# Reformat the column we are predicting: encode the income label as 0/1.
# Labels with and without a trailing period map to the same code; any label
# not listed here becomes NaN.
income_codes = {'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}
training_data['over50k'] = training_data['over50k'].map(income_codes)
training_data.head(4)

# Identify Numeric features. The 0/1 target `over50k` is included so that it
# shows up in the correlation matrix built from this list.
numeric_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'over50k']

# Identify Categorical features. The names must match the labels assigned to
# `training_data.columns` exactly (including the `workClass` capitalisation,
# the `maritial_status` spelling and `country`), otherwise selecting these
# columns raises KeyError.
cat_features = ['workClass', 'education', 'maritial_status', 'occupation', 'relationship', 'race', 'sex', 'country']

# Count of >50K & <=50K
# The column is passed by name with `x=`/`data=`: recent seaborn releases no
# longer accept the vector as the first positional argument.
sns.countplot(x='over50k', data=training_data, label="Count")
# Author's observation: the data has more <=50K rows than >50K rows.

# Author's decision: no down-sampling / up-sampling is applied.

# Correlation matrix between numerical values
sns.heatmap(training_data[numeric_features].corr(), annot=True, fmt=".2f", cmap="coolwarm")

# Explore Education Num vs Income
# Author's observation: a higher education_num goes with a higher chance of >50K.
# `catplot(..., height=...)` replaces `factorplot(..., size=...)`, which was
# deprecated in seaborn 0.9 and later removed; kind="bar" keeps the same plot.
g = sns.catplot(x="education_num", y="over50k", data=training_data, kind="bar", height=6, palette="muted")
g.despine(left=True)
g = g.set_ylabels(">50K probability")

# Explore Hours Per Week vs Income
g = sns.catplot(x="hours_per_week", y="over50k", data=training_data, kind="bar", height=6, palette="muted")
g.despine(left=True)
g = g.set_ylabels(">50K probability")

# Explore Age vs Income
# Author's observation: the age distribution for income <=50K is right skewed.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn; it is kept so the
# cell still runs on the older releases this notebook was written against.
g = sns.FacetGrid(training_data, col='over50k')
g = g.map(sns.distplot, "age")

# Fill Missing Category Entries
# Each listed column has its NaN entries replaced by the given placeholder.
category_fill_values = (
    ("workClass", "X"),
    ("occupation", "X"),
    ("country", "United-States"),
)
for column_name, placeholder in category_fill_values:
    training_data[column_name] = training_data[column_name].fillna(placeholder)

# Re-count missing values per column after filling.
training_data.isnull().sum()

# Explore Native Nation vs Income
g = sns.barplot(x="country", y="over50k", data=training_data)
g = g.set_ylabel("Income >50K Probability")

# Explore Sex vs Income
g = sns.barplot(x="sex", y="over50k", data=training_data)
g = g.set_ylabel("Income >50K Probability")

# The three grids below use `catplot(..., height=...)` instead of
# `factorplot(..., size=...)`: factorplot was deprecated in seaborn 0.9 and
# later removed. With kind="bar" the resulting plot is the same.

# Explore Relationship vs Income
g = sns.catplot(x="relationship", y="over50k", data=training_data, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")

# Explore Marital Status vs Income
g = sns.catplot(x="maritial_status", y="over50k", data=training_data, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")

# Explore Workclass vs Income
g = sns.catplot(x="workClass", y="over50k", data=training_data, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("Income >50K Probability")

#@title Load the training data.

from IPython import display  # display and printing helpers
import pandas as pd  # Pandas (http://pandas.pydata.org) is a fast, powerful and popular open source data analysis library.
import numpy as np
from IPython.display import HTML

# Re-read the training CSV for modelling; '?' cells are parsed as NaN this time.
train = pd.read_csv("training.csv", na_values='?', encoding='utf-8')

#@title Renaming column names in train
train.columns = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'Education Num',
    'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital Gain', 'Capital Loss', 'Hours/Week', 'Native Country', 'Target',
]

#@title Load the test data.

# Same parsing options as the training file.
test_data = pd.read_csv("test.csv", na_values='?', encoding='utf-8')

#@title Renaming column names in test_data
# Identical labels to `train` so both frames can be processed the same way.
test_data.columns = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'Education Num',
    'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital Gain', 'Capital Loss', 'Hours/Week', 'Native Country', 'Target',
]

#@title Checking the info of each variables
train.info()

#@title  Filling NAs with mode of the column

# `mode()` may return several rows when values tie; row 0 holds the first mode
# of every column. `modes` is reused later to impute the test set.
modes = train.mode().iloc[0]
train.fillna(modes, inplace=True)

#@title Checking categorical variables that needs to be encoded
# Compare against the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24. Iterating `dtypes.items()` also avoids
# positional integer lookups on a label-indexed Series.
cols_to_encode = [col for col, dtype in train.dtypes.items() if dtype == object]
cols_to_encode

#@title Checking if columns Education and Education Num provide same data
# Number of distinct 'Education Num' values within each 'Education' label.
train.groupby('Education').nunique()['Education Num']

#@title Dropping Education column as it is already encoded with Education Num

train.drop('Education', axis=1, inplace=True)
cols_to_encode.remove('Education')
train.head()

#@title Creating dummy variables for categorical variables

# One-hot encode every remaining non-numeric column, dropping the first level
# of each.
train = pd.get_dummies(train, drop_first=True)
train.head()

#@title Creating Cross Validation (with k = 5) and creating empty lists where model data can be stored.

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             classification_report, confusion_matrix)

# Five consecutive, unshuffled folds. `random_state` is intentionally omitted:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError when it is set on a non-shuffling KFold. The splits are the same
# deterministic ones the original call produced on older releases.
cv = KFold(n_splits=5)

# Parallel result lists: entry i of each list belongs to Model[i].
Model = []
Accuracy = []
Precision = []
Recall = []
F1 = []
AUC = []

#@title Splitting the training dataset into X and y and scaling the data.

from sklearn.preprocessing import StandardScaler

# The last column of `train` is used as the target; every column before it is
# a feature.
feature_columns = train.columns[:-1]
target_column = train.columns[-1]
x = train[feature_columns]
y = train[target_column]

# Fit the scaler on the training features, then scale them with it; the same
# fitted `scaler` is applied to the test set later.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

#@title Base Model

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate

# Baseline that always predicts the most frequent class.
clf = DummyClassifier(strategy='most_frequent', random_state=1)
Model.append("Dummy")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the model separately for every metric.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Grid-search the regularisation strength C over the shared CV splitter.
lr = LogisticRegression()
param_grid = {'C': [0.1, 0.4, 0.7]}
grid1 = GridSearchCV(lr, param_grid, cv=cv).fit(x_scaled, y)
print("Grid Logistic Regression: ", grid1.best_score_, grid1.best_params_)

#@title Appending results from the best searched in Grid
clf = grid1.best_estimator_
Model.append("Logistic Regression")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the model separately for every metric.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title SVM Classifer

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

# Grid-search C for a linear-kernel SVM over the shared CV splitter.
svc = SVC()
param_grid = {'C': [0.1, 0.4, 0.7],
              'kernel': ['linear']}
grid1 = GridSearchCV(svc, param_grid, cv=cv).fit(x_scaled, y)
print("Grid SVC: ", grid1.best_score_, grid1.best_params_)

#@title Appending results from the best searched in Grid

clf = grid1.best_estimator_
Model.append("SVC")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the model separately for every metric.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

# Grid-search tree depth and split criterion over the shared CV splitter.
dtc = DecisionTreeClassifier()
param_grid = {'max_depth': [100, 400, None],
              'criterion': ['gini', 'entropy']}
grid1 = GridSearchCV(dtc, param_grid, cv=cv).fit(x_scaled, y)
print("Grid DTC: ", grid1.best_score_, grid1.best_params_)

#@title Appending results from the best searched in Grid

clf = grid1.best_estimator_
Model.append("Decision Tree Classifier")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the model separately for every metric.
# Because this tree has no random_state, this also means all five numbers now
# describe the same fitted trees.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title Random Forest Classifier
# Finding the optimum number of trees first in Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Stage 1: coarse search over the number of trees.
rfc = RandomForestClassifier(random_state=1)
param_grid = {'n_estimators': [10, 40, 70, 100, 400, 700, 1000]}
grid1 = GridSearchCV(rfc, param_grid, cv=cv).fit(x_scaled, y)
print("Grid RFC: ", grid1.best_score_, grid1.best_params_)

# Stage 2: finer search around 400 trees (the range is hard-coded, not derived
# from the stage-1 result).
rfc = RandomForestClassifier(random_state=1)
param_grid = {'n_estimators': [350, 400, 450]}
grid2 = GridSearchCV(rfc, param_grid, cv=cv).fit(x_scaled, y)
print("Grid RFC: ", grid2.best_score_, grid2.best_params_)

# Stage 3: starting from the stage-2 winner, search criterion and depth.
rfc = grid2.best_estimator_
param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': [5, 10, 15, 20, 25, 30, None]}
grid3 = GridSearchCV(rfc, param_grid, cv=cv).fit(x_scaled, y)
print("Grid RFC: ", grid3.best_score_, grid3.best_params_)

#@title Appending results from the best searched in Grid

clf = grid3.best_estimator_
Model.append("Random Forest Classifier")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the forest separately for every metric.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title Neural Network Classifier

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

# Grid-search architecture, activation, solver, L2 penalty and learning-rate
# schedule over the shared CV splitter.
mlp = MLPClassifier(random_state=1)
param_grid = {'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
              'activation': ['tanh', 'relu'],
              'solver': ['sgd', 'adam'],
              'alpha': [0.0001, 0.05],
              'learning_rate': ['constant', 'adaptive']}
grid1 = GridSearchCV(mlp, param_grid, cv=cv).fit(x_scaled, y)
print("Grid MLP: ", grid1.best_score_, grid1.best_params_)

#@title Appending results from the best searched in Grid

clf = grid1.best_estimator_
Model.append("MLP Classifier")

# One multi-metric cross-validation run fits each fold once and scores all five
# metrics on it, instead of re-fitting the network separately for every metric.
cv_scores = cross_validate(clf, x_scaled, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
Accuracy.append(cv_scores['test_accuracy'].mean())
Precision.append(cv_scores['test_precision'].mean())
Recall.append(cv_scores['test_recall'].mean())
F1.append(cv_scores['test_f1'].mean())
AUC.append(cv_scores['test_roc_auc'].mean())

#@title Evaluating performance of different models

# Assemble the parallel result lists into one table, one row per model.
score_column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']
score_column_values = [Model, Accuracy, Precision, Recall, F1, AUC]
evaluation = pd.DataFrame(dict(zip(score_column_names, score_column_values)))

print("FOLLOWING ARE THE TRAINING SCORES: ")
evaluation

#@title Treating test dataset - Drop Education, Fill na with Mode, Get Dummies for categorical variables

# Same steps as for `train`: drop 'Education', impute with the training modes,
# then one-hot encode.
test_data.drop('Education', axis=1, inplace=True)
test_data.fillna(modes, inplace=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align to the training layout in one step: columns present only in `train`
# are created and filled with 0, columns present only in the test set are
# dropped, and the order follows `train.columns`.
test_data = test_data.reindex(columns=train.columns, fill_value=0)

#@title Scaling test dataset

# Features are every column but the last; the last column is the target.
test_feature_columns = test_data.columns[:-1]
test_target_column = test_data.columns[-1]

# Reuse the scaler that was fitted on the training features.
x_test = scaler.transform(test_data[test_feature_columns])
y_test = test_data[test_target_column].values

#@title Applying Random Forest Classifier on the test dataset

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hyper-parameters are hard-coded here.
# NOTE(review): presumably these are the winners of the random-forest grid
# search above; confirm against grid3.best_params_.
clf = RandomForestClassifier(random_state=1, criterion='gini',
                             max_depth=20, n_estimators=400)

# Train on the full training set and score on the held-out test set.
# Previously the model was cross-validated on the test set alone, so the
# reported "test" numbers never involved the training data at all.
clf.fit(x_scaled, y)
test_predictions = clf.predict(x_test)
# Score of the second class in clf.classes_ (the positive class), as
# roc_auc_score expects for a binary target.
test_positive_scores = clf.predict_proba(x_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, test_predictions))
print("Test Precision:", precision_score(y_test, test_predictions))
print("Test Recall:", recall_score(y_test, test_predictions))
print("Test F1 Score:", f1_score(y_test, test_predictions))
print("Test AUC:", roc_auc_score(y_test, test_positive_scores))