Skip to content

Commit 238dbe7

Browse files
committed
problems with randomforest model fixed
1 parent d2dcaf9 commit 238dbe7

11 files changed

+149
-53
lines changed

__pycache__/explain.cpython-37.pyc

348 Bytes
Binary file not shown.

explain.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,52 @@ def __init__(self):
2727
super(explain, self).__init__()
2828
self.param= None
2929

30+
# is classification function?
31+
32+
def is_classification_given_y_array(self, y_test):
33+
is_classification = False
34+
total = len(y_test)
35+
total_unique = len(set(y_test))
36+
if total < 30:
37+
if total_unique < 10:
38+
is_classification = True
39+
else:
40+
if total_unique < 20:
41+
is_classification = True
42+
return is_classification
43+
3044

3145
def ai(self, df, y, model, model_name="xgboost", mode=None):
3246
y_variable= "y_actual"
3347
y_variable_predict= "y_prediction"
3448

3549

50+
# is classification?
51+
is_classification= self.is_classification_given_y_array(y)
3652

37-
#shap
38-
c = calculate_shap()
39-
self.df_final = c.find(model, df, model_name=model_name)
53+
# If yes, then different shap functions are required.
54+
# get the shap value based on prediction and make a new dataframe.
4055

41-
#prediction col
42-
if model_name=="xgboost":
43-
self.df_final[y_variable_predict] = model.predict(xgboost.DMatrix(df))
56+
# find predictions first as shap values need that.
57+
58+
prediction_col=[]
59+
60+
if model_name == "xgboost":
61+
prediction_col = model.predict(xgboost.DMatrix(df))
4462

45-
elif model_name=="catboost":
46-
self.df_final[y_variable_predict] = model.predict(df.to_numpy())
63+
elif model_name == "catboost":
64+
prediction_col = model.predict(df.to_numpy())
4765

4866
else:
49-
self.df_final[y_variable_predict] = model.predict(df.to_numpy())
67+
prediction_col = model.predict(df.to_numpy())
68+
69+
70+
#shap
71+
c = calculate_shap()
72+
self.df_final = c.find(model, df, prediction_col, is_classification, model_name=model_name)
73+
74+
#prediction col
75+
self.df_final[y_variable_predict] = prediction_col
5076

5177

5278

1.46 KB
Binary file not shown.
0 Bytes
Binary file not shown.
Binary file not shown.
0 Bytes
Binary file not shown.

lib/calculate_shap.py

Lines changed: 96 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,13 @@ def catboost_shap(self, model, df, y_variable=None):
7575

7676
return Y
7777

78+
7879
def kernel_shap(self, model, X_train):
7980
# use Kernel SHAP to explain test set predictions
8081
explainer = shap.KernelExplainer(model.predict_proba, X_train)
8182
shap_values = explainer.shap_values(X_train, nsamples=100)
8283

83-
pd_shap = pd.DataFrame(np.concatenate(shap_values))
84+
pd_shap = pd.DataFrame(shap_values)
8485
all_columns = list(X_train.columns)
8586

8687
shap_columns = []
@@ -97,18 +98,79 @@ def kernel_shap(self, model, X_train):
9798

9899
return Y
99100

101+
def kernel_shap_classification(self, model, X_train,prediction_col):
102+
# use Kernel SHAP to explain test set predictions
103+
explainer = shap.KernelExplainer(model.predict_proba, X_train)
104+
shap_values = explainer.shap_values(X_train, nsamples=100)
105+
106+
pd_shap = self.select_row_shap_values(shap_values, prediction_col)
107+
all_columns = list(X_train.columns)
108+
109+
shap_columns = []
110+
111+
for i in all_columns:
112+
shap_columns.append(i + "_impact")
113+
pd_shap.columns = shap_columns
114+
115+
116+
117+
Y = X_train.copy()
118+
for c in shap_columns:
119+
Y[c] = list(pd_shap[c])
120+
121+
return Y
122+
123+
def select_row_shap_values(self, shap_values,prediction_col):
124+
125+
num_of_classes = len(shap_values)
126+
127+
if num_of_classes== len(prediction_col):
128+
df_final = pd.DataFrame(shap_values)
129+
return df_final
130+
131+
point_no=0
132+
df_array = []
133+
for p in prediction_col:
134+
df_array.append(shap_values[p][point_no])
135+
point_no=point_no+1
136+
137+
df_final = pd.DataFrame(df_array)
138+
return df_final
139+
140+
141+
def randomforest_shap_classification(self, model, X,prediction_col):
142+
explainer = shap.TreeExplainer(model)
143+
shap_values = explainer.shap_values(X)
144+
145+
146+
pd_shap = self.select_row_shap_values(shap_values,prediction_col)
147+
all_columns = list(X.columns)
148+
149+
150+
pd_shap.columns = [f"{y}_impact" for y in all_columns]
151+
152+
shap_columns = pd_shap.columns
153+
154+
Y = X.copy()
155+
for c in shap_columns:
156+
Y[c] = list(pd_shap[c])
157+
158+
159+
return Y
160+
100161

101162
def randomforest_shap(self, model, X):
102163
explainer = shap.TreeExplainer(model)
103164
shap_values = explainer.shap_values(X)
104165

105-
pd_shap = pd.DataFrame(np.concatenate(shap_values))
166+
167+
pd_shap = pd.DataFrame(shap_values)
106168
all_columns = list(X.columns)
107169

108170

109171
pd_shap.columns = [f"{y}_impact" for y in all_columns]
110172

111-
shap_columns = shap_values.columns
173+
shap_columns = pd_shap.columns
112174

113175
Y = X.copy()
114176
for c in shap_columns:
@@ -130,7 +192,7 @@ def get_shap_values(self, x_array, model, x_variable, cat_index):
130192
shap_values = pd.DataFrame(data=shap_values, columns=total_columns)
131193
return shap_values
132194

133-
def find(self, model, df, model_name="xgboost"):
195+
def find(self, model, df,prediction_col,is_classification, model_name="xgboost"):
134196

135197
if model_name == "xgboost":
136198
df2 = self.xgboost_shap(model, df)
@@ -147,37 +209,60 @@ def find(self, model, df, model_name="xgboost"):
147209

148210

149211
elif model_name == "randomforest":
150-
df2 = self.randomforest_shap(model, df)
212+
if is_classification:
213+
df2 = self.randomforest_shap_classification(model, df, prediction_col)
214+
else:
215+
df2 = self.randomforest_shap(model, df)
151216
return df2
152217

153218
elif model_name == "svm":
154-
df2 = self.kernel_shap(model, df)
219+
if is_classification:
220+
df2 = self.kernel_shap_classification(model, df,prediction_col)
221+
else:
222+
df2 = self.kernel_shap(model, df)
155223
return df2
156224

157225
elif model_name == "knn":
158-
df2 = self.kernel_shap(model, df)
226+
if is_classification:
227+
df2 = self.kernel_shap_classification(model, df,prediction_col)
228+
else:
229+
df2 = self.kernel_shap(model, df)
159230
return df2
160231

161232
elif model_name == "logisticregression":
162-
df2 = self.kernel_shap(model, df)
233+
if is_classification:
234+
df2 = self.kernel_shap_classification(model, df,prediction_col)
235+
else:
236+
df2 = self.kernel_shap(model, df)
163237
return df2
164238

165239
elif model_name == "decisiontree":
166-
df2 = self.kernel_shap(model, df)
240+
if is_classification:
241+
df2 = self.kernel_shap_classification(model, df,prediction_col)
242+
else:
243+
df2 = self.kernel_shap(model, df)
167244
return df2
168245

169246
elif model_name == "neuralnetwork":
170-
df2 = self.kernel_shap(model, df)
247+
if is_classification:
248+
df2 = self.kernel_shap_classification(model, df,prediction_col)
249+
else:
250+
df2 = self.kernel_shap(model, df)
171251
return df2
252+
172253
elif model_name=="gradientboostingregressor":
173254
df2 = self.xgboost_shap(model, df)
174255
return df2
175256
elif "gradientboosting" in model_name:
176257
df2 = self.xgboost_shap(model, df)
177258
return df2
178259
else:
179-
df2 = self.kernel_shap(model, df)
260+
if is_classification:
261+
df2 = self.kernel_shap_classification(model, df,prediction_col)
262+
else:
263+
df2 = self.kernel_shap(model, df)
180264
return df2
181265

182266

183267

268+

lib/plotly_graphs.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,16 @@
44
from shap_pdp import *
55
from summary_plot import *
66
from data_for_shap_graphs import *
7-
import plotly.graph_objects as go
8-
97

108
class plotly_graphs():
119
def __init__(self):
1210
super(plotly_graphs, self).__init__()
13-
self.data = data_for_shap_graphs()
11+
self.data= data_for_shap_graphs()
1412

1513
# save all important variables here.
1614

17-
def feature_importance(self, df):
15+
16+
def feature_importance(self, df):
1817
df2 = self.data.feature_importance(df)
1918

2019
names = list(df2["VariableName"])
@@ -25,8 +24,7 @@ def feature_importance(self, df):
2524

2625
df2["VariableName"] = new_names
2726

28-
feature_importance = px.bar(df2, x='Impact_Value', y="VariableName", orientation='h',
29-
title='Feature Importance', )
27+
feature_importance = px.bar(df2, x='Impact_Value', y="VariableName", orientation='h', title='Feature Importance',)
3028
return feature_importance, df2
3129

3230
def feature_impact(self, df):
@@ -47,31 +45,21 @@ def feature_impact(self, df):
4745

4846
def summary_plot(self, df):
4947
df2 = self.data.summary_plot(df)
50-
# summary_plot = go.Figure()
51-
5248

53-
# summary_plot.add_trace(go.Scattergl(x=df2['xaxis'], y=df2['yaxis'],
54-
# mode='markers',hovertext=df2['hover'],
55-
# marker=dict(color=list(df2['color']),showscale=True,autocolorscale=False,
56-
# cauto=False)))
5749
summary_plot = px.scatter(df2, x="xaxis", y="yaxis", color="color", hover_data=["hover"])
5850

5951
return summary_plot, df2
6052

6153
def partial_dependence_plot(self, df, v1, v2, v3):
6254
pdp = shap_pdp()
6355
df = pdp.find(df)
64-
g = go.Figure()
65-
print("new")
66-
# g.add_trace(go.Scattergl(x=df[v1], y=df[v2], mode='markers',showlegend=True,
67-
# marker=dict(color=list(df[v3]),
68-
# autocolorscale=False,
69-
# cauto=False
70-
# )))
71-
72-
g = px.scatter(df, x=v1, y=v2, color=v3)
56+
g= px.scatter(df, x=v1, y=v2, color=v3)
7357
return g, df
7458

59+
7560
def distributions(self, df, variable_name):
76-
graph = px.histogram(df, x=variable_name, marginal="box")
61+
graph= px.histogram(df, x=variable_name, marginal="box")
7762
return graph
63+
64+
65+

lib/rescale_numeric_feature.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,9 @@ def add_col_rescaled(self, df):
9999

100100
for nc in numeric_columns:
101101
# get min and max
102-
if nc in df_describe:
103-
mini, maxi = self.get_min_max(df_describe, nc)
102+
mini, maxi = self.get_min_max(df_describe, nc)
104103

105-
df[nc + "_rescaled"] = (df[nc] - mini) / (maxi - mini) * 10
104+
df[nc + "_rescaled"] = (df[nc] - mini) / (maxi - mini) * 10
106105

107106
for cc in categorical_columns:
108107
df[cc + "_rescaled"] = 0

lib/summary_plot.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,11 @@ def rearrange_dataframe(self, df_re ):
3333
df_final = pd.DataFrame()
3434

3535
for v in self.original_columns:
36-
try:
37-
df_single = df_re[[v, v + '_rescaled', v + '_impact']]
38-
df_single["variable_name"] = v
39-
df_single.columns = ['hover', 'color', 'xaxis', 'yaxis']
40-
df_final = pd.concat([df_final, df_single])
41-
except Exception as e:
42-
pass
36+
df_single = df_re[[v, v + '_rescaled', v + '_impact']]
37+
df_single["variable_name"] = v
38+
df_single.columns = ['hover', 'color', 'xaxis', 'yaxis']
39+
df_final = pd.concat([df_final, df_single])
40+
4341
return df_final
4442

4543

0 commit comments

Comments
 (0)