Skip to content

Commit 238dbe7

Browse files
committed
problems with randomforest model fixed
1 parent d2dcaf9 commit 238dbe7

11 files changed

+149
-53
lines changed

__pycache__/explain.cpython-37.pyc

348 Bytes
Binary file not shown.

explain.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,52 @@ def __init__(self):
2727
super(explain, self).__init__()
2828
self.param= None
2929

30+
# is classification function?
31+
32+
def is_classification_given_y_array(self, y_test):
33+
is_classification = False
34+
total = len(y_test)
35+
total_unique = len(set(y_test))
36+
if total < 30:
37+
if total_unique < 10:
38+
is_classification = True
39+
else:
40+
if total_unique < 20:
41+
is_classification = True
42+
return is_classification
43+
3044

3145
def ai(self, df, y, model, model_name="xgboost", mode=None):
3246
y_variable= "y_actual"
3347
y_variable_predict= "y_prediction"
3448

3549

50+
# is classification?
51+
is_classification= self.is_classification_given_y_array(y)
3652

37-
#shap
38-
c = calculate_shap()
39-
self.df_final = c.find(model, df, model_name=model_name)
53+
# If yes, then different shap functions are required.
54+
# get the shap value based on prediction and make a new dataframe.
4055

41-
#prediction col
42-
if model_name=="xgboost":
43-
self.df_final[y_variable_predict] = model.predict(xgboost.DMatrix(df))
56+
# find predictions first as shap values need that.
57+
58+
prediction_col=[]
59+
60+
if model_name == "xgboost":
61+
prediction_col = model.predict(xgboost.DMatrix(df))
4462

45-
elif model_name=="catboost":
46-
self.df_final[y_variable_predict] = model.predict(df.to_numpy())
63+
elif model_name == "catboost":
64+
prediction_col = model.predict(df.to_numpy())
4765

4866
else:
49-
self.df_final[y_variable_predict] = model.predict(df.to_numpy())
67+
prediction_col = model.predict(df.to_numpy())
68+
69+
70+
#shap
71+
c = calculate_shap()
72+
self.df_final = c.find(model, df, prediction_col, is_classification, model_name=model_name)
73+
74+
#prediction col
75+
self.df_final[y_variable_predict] = prediction_col
5076

5177

5278

1.46 KB
Binary file not shown.
0 Bytes
Binary file not shown.
Binary file not shown.
0 Bytes
Binary file not shown.

lib/calculate_shap.py

Lines changed: 96 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,13 @@ def catboost_shap(self, model, df, y_variable=None):
7575

7676
return Y
7777

78+
7879
def kernel_shap(self, model, X_train):
7980
# use Kernel SHAP to explain test set predictions
8081
explainer = shap.KernelExplainer(model.predict_proba, X_train)
8182
shap_values = explainer.shap_values(X_train, nsamples=100)
8283

83-
pd_shap = pd.DataFrame(np.concatenate(shap_values))
84+
pd_shap = pd.DataFrame(shap_values)
8485
all_columns = list(X_train.columns)
8586

8687
shap_columns = []
@@ -97,18 +98,79 @@ def kernel_shap(self, model, X_train):
9798

9899
return Y
99100

101+
def kernel_shap_classification(self, model, X_train,prediction_col):
102+
# use Kernel SHAP to explain test set predictions
103+
explainer = shap.KernelExplainer(model.predict_proba, X_train)
104+
shap_values = explainer.shap_values(X_train, nsamples=100)
105+
106+
pd_shap = self.select_row_shap_values(shap_values, prediction_col)
107+
all_columns = list(X_train.columns)
108+
109+
shap_columns = []
110+
111+
for i in all_columns:
112+
shap_columns.append(i + "_impact")
113+
pd_shap.columns = shap_columns
114+
115+
116+
117+
Y = X_train.copy()
118+
for c in shap_columns:
119+
Y[c] = list(pd_shap[c])
120+
121+
return Y
122+
123+
def select_row_shap_values(self, shap_values,prediction_col):
124+
125+
num_of_classes = len(shap_values)
126+
127+
if num_of_classes== len(prediction_col):
128+
df_final = pd.DataFrame(shap_values)
129+
return df_final
130+
131+
point_no=0
132+
df_array = []
133+
for p in prediction_col:
134+
df_array.append(shap_values[p][point_no])
135+
point_no=point_no+1
136+
137+
df_final = pd.DataFrame(df_array)
138+
return df_final
139+
140+
141+
def randomforest_shap_classification(self, model, X,prediction_col):
142+
explainer = shap.TreeExplainer(model)
143+
shap_values = explainer.shap_values(X)
144+
145+
146+
pd_shap = self.select_row_shap_values(shap_values,prediction_col)
147+
all_columns = list(X.columns)
148+
149+
150+
pd_shap.columns = [f"{y}_impact" for y in all_columns]
151+
152+
shap_columns = pd_shap.columns
153+
154+
Y = X.copy()
155+
for c in shap_columns:
156+
Y[c] = list(pd_shap[c])
157+
158+
159+
return Y
160+
100161

101162
def randomforest_shap(self, model, X):
102163
explainer = shap.TreeExplainer(model)
103164
shap_values = explainer.shap_values(X)
104165

105-
pd_shap = pd.DataFrame(np.concatenate(shap_values))
166+
167+
pd_shap = pd.DataFrame(shap_values)
106168
all_columns = list(X.columns)
107169

108170

109171
pd_shap.columns = [f"{y}_impact" for y in all_columns]
110172

111-
shap_columns = shap_values.columns
173+
shap_columns = pd_shap.columns
112174

113175
Y = X.copy()
114176
for c in shap_columns:
@@ -130,7 +192,7 @@ def get_shap_values(self, x_array, model, x_variable, cat_index):
130192
shap_values = pd.DataFrame(data=shap_values, columns=total_columns)
131193
return shap_values
132194

133-
def find(self, model, df, model_name="xgboost"):
195+
def find(self, model, df,prediction_col,is_classification, model_name="xgboost"):
134196

135197
if model_name == "xgboost":
136198
df2 = self.xgboost_shap(model, df)
@@ -147,37 +209,60 @@ def find(self, model, df, model_name="xgboost"):
147209

148210

149211
elif model_name == "randomforest":
150-
df2 = self.randomforest_shap(model, df)
212+
if is_classification:
213+
df2 = self.randomforest_shap_classification(model, df, prediction_col)
214+
else:
215+
df2 = self.randomforest_shap(model, df)
151216
return df2
152217

153218
elif model_name == "svm":
154-
df2 = self.kernel_shap(model, df)
219+
if is_classification:
220+
df2 = self.kernel_shap_classification(model, df,prediction_col)
221+
else:
222+
df2 = self.kernel_shap(model, df)
155223
return df2
156224

157225
elif model_name == "knn":
158-
df2 = self.kernel_shap(model, df)
226+
if is_classification:
227+
df2 = self.kernel_shap_classification(model, df,prediction_col)
228+
else:
229+
df2 = self.kernel_shap(model, df)
159230
return df2
160231

161232
elif model_name == "logisticregression":
162-
df2 = self.kernel_shap(model, df)
233+
if is_classification:
234+
df2 = self.kernel_shap_classification(model, df,prediction_col)
235+
else:
236+
df2 = self.kernel_shap(model, df)
163237
return df2
164238

165239
elif model_name == "decisiontree":
166-
df2 = self.kernel_shap(model, df)
240+
if is_classification:
241+
df2 = self.kernel_shap_classification(model, df,prediction_col)
242+
else:
243+
df2 = self.kernel_shap(model, df)
167244
return df2
168245

169246
elif model_name == "neuralnetwork":
170-
df2 = self.kernel_shap(model, df)
247+
if is_classification:
248+
df2 = self.kernel_shap_classification(model, df,prediction_col)
249+
else:
250+
df2 = self.kernel_shap(model, df)
171251
return df2
252+
172253
elif model_name=="gradientboostingregressor":
173254
df2 = self.xgboost_shap(model, df)
174255
return df2
175256
elif "gradientboosting" in model_name:
176257
df2 = self.xgboost_shap(model, df)
177258
return df2
178259
else:
179-
df2 = self.kernel_shap(model, df)
260+
if is_classification:
261+
df2 = self.kernel_shap_classification(model, df,prediction_col)
262+
else:
263+
df2 = self.kernel_shap(model, df)
180264
return df2
181265

182266

183267

268+

lib/plotly_graphs.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,16 @@
44
from shap_pdp import *
55
from summary_plot import *
66
from data_for_shap_graphs import *
7-
import plotly.graph_objects as go
8-
97

108
class plotly_graphs():
119
def __init__(self):
1210
super(plotly_graphs, self).__init__()
13-
self.data = data_for_shap_graphs()
11+
self.data= data_for_shap_graphs()
1412

1513
# save all important variables here.
1614

17-
def feature_importance(self, df):
15+
16+
def feature_importance(self, df):
1817
df2 = self.data.feature_importance(df)
1918

2019
names = list(df2["VariableName"])
@@ -25,8 +24,7 @@ def feature_importance(self, df):
2524

2625
df2["VariableName"] = new_names
2726

28-
feature_importance = px.bar(df2, x='Impact_Value', y="VariableName", orientation='h',
29-
title='Feature Importance', )
27+
feature_importance = px.bar(df2, x='Impact_Value', y="VariableName", orientation='h', title='Feature Importance',)
3028
return feature_importance, df2
3129

3230
def feature_impact(self, df):
@@ -47,31 +45,21 @@ def feature_impact(self, df):
4745

4846
def summary_plot(self, df):
4947
df2 = self.data.summary_plot(df)
50-
# summary_plot = go.Figure()
51-
5248

53-
# summary_plot.add_trace(go.Scattergl(x=df2['xaxis'], y=df2['yaxis'],
54-
# mode='markers',hovertext=df2['hover'],
55-
# marker=dict(color=list(df2['color']),showscale=True,autocolorscale=False,
56-
# cauto=False)))
5749
summary_plot = px.scatter(df2, x="xaxis", y="yaxis", color="color", hover_data=["hover"])
5850

5951
return summary_plot, df2
6052

6153
def partial_dependence_plot(self, df, v1, v2, v3):
6254
pdp = shap_pdp()
6355
df = pdp.find(df)
64-
g = go.Figure()
65-
print("new")
66-
# g.add_trace(go.Scattergl(x=df[v1], y=df[v2], mode='markers',showlegend=True,
67-
# marker=dict(color=list(df[v3]),
68-
# autocolorscale=False,
69-
# cauto=False
70-
# )))
71-
72-
g = px.scatter(df, x=v1, y=v2, color=v3)
56+
g= px.scatter(df, x=v1, y=v2, color=v3)
7357
return g, df
7458

59+
7560
def distributions(self, df, variable_name):
76-
graph = px.histogram(df, x=variable_name, marginal="box")
61+
graph= px.histogram(df, x=variable_name, marginal="box")
7762
return graph
63+
64+
65+

lib/rescale_numeric_feature.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,9 @@ def add_col_rescaled(self, df):
9999

100100
for nc in numeric_columns:
101101
# get min and max
102-
if nc in df_describe:
103-
mini, maxi = self.get_min_max(df_describe, nc)
102+
mini, maxi = self.get_min_max(df_describe, nc)
104103

105-
df[nc + "_rescaled"] = (df[nc] - mini) / (maxi - mini) * 10
104+
df[nc + "_rescaled"] = (df[nc] - mini) / (maxi - mini) * 10
106105

107106
for cc in categorical_columns:
108107
df[cc + "_rescaled"] = 0

lib/summary_plot.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,11 @@ def rearrange_dataframe(self, df_re ):
3333
df_final = pd.DataFrame()
3434

3535
for v in self.original_columns:
36-
try:
37-
df_single = df_re[[v, v + '_rescaled', v + '_impact']]
38-
df_single["variable_name"] = v
39-
df_single.columns = ['hover', 'color', 'xaxis', 'yaxis']
40-
df_final = pd.concat([df_final, df_single])
41-
except Exception as e:
42-
pass
36+
df_single = df_re[[v, v + '_rescaled', v + '_impact']]
37+
df_single["variable_name"] = v
38+
df_single.columns = ['hover', 'color', 'xaxis', 'yaxis']
39+
df_final = pd.concat([df_final, df_single])
40+
4341
return df_final
4442

4543

0 commit comments

Comments
 (0)