Multiple corrections and code completion (#98)

* Multiple corrections and code completion - Display bug for autoML corrected - Added to_python for Pipelines - to_python is now available for NaiveBayes and OneHotEncoder - Corrected to_sql for OneHotEncoder - Corrected read_csv for headers containing spaces - Corrected outlier_plot * to_python correction - Corrected to_python - Added tests for to_python * AUC value correction * Bug fix + simplification - Test simplification - Corrected a bug in model selection * Update test_onehotencoder.py * Small corrections
vertica · May 4, 2021 · c4e8b74 · c4e8b74
1 parent e3f3040
commit c4e8b74
Show file tree

Hide file tree

Showing 23 changed files with 318 additions and 71 deletions.
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 
 setuptools.setup(
     name = 'verticapy',  
-    version = '0.6.0',
+    version = '0.6.1',
     author = "Badr Ouali",
     author_email = "[email protected]",
     url = "https://github.com/vertica/VerticaPy",

diff --git a/verticapy/__init__.py b/verticapy/__init__.py
@@ -46,7 +46,7 @@
 # of moving data around for processing, VerticaPy brings the logic to the data.
 #
 #
-__version__ = "0.6.0"
+__version__ = "0.6.1"
 __author__ = "Badr Ouali"
 __author_email__ = "[email protected]"
 __description__ = """VerticaPy simplifies data exploration, data cleaning and machine learning in Vertica."""

diff --git a/verticapy/learn/metrics.py b/verticapy/learn/metrics.py
@@ -688,7 +688,7 @@ def accuracy_score(
     y_score: str,
     input_relation: (str, vDataFrame),
     cursor=None,
-    pos_label: (int, float, str) = 1,
+    pos_label: (int, float, str) = None,
 ):
     """
 ---------------------------------------------------------------------------

diff --git a/verticapy/learn/mlplot.py b/verticapy/learn/mlplot.py
@@ -461,7 +461,7 @@ def plot_importance(
     if print_legend:
         orange = mpatches.Patch(color=color_dict(style_kwds, 1), label="sign -")
         blue = mpatches.Patch(color=color_dict(style_kwds, 0), label="sign +")
-        ax.legend(handles=[orange, blue], loc="center left", bbox_to_anchor=[1, 0.5])
+        ax.legend(handles=[blue, orange,], loc="center left", bbox_to_anchor=[1, 0.5])
         box = ax.get_position()
         ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
     ax.set_ylabel("Features")
@@ -610,28 +610,34 @@ def plot_bubble_ml(x: list, y: list, s: list = None, z: list = [], x_label: str
         ax.spines['bottom'].set_position('center')
         ax.spines['right'].set_color('none')
         ax.spines['top'].set_color('none')
-        plt.text(max(x) + 0.1, max(y) + 0.1, 
+        delta_x = (max(x) - min(x)) * 0.1
+        delta_y = (max(y) - min(y)) * 0.1
+        plt.text(max(x) + delta_x if reverse[0] else min(x) - delta_x, 
+                 max(y) + delta_y if reverse[1] else min(y) - delta_y, 
                  "Modest", size=15, rotation=130.,
                  ha="center", va="center",
                  bbox=dict(boxstyle="round",
                            ec=gen_colors()[0],
                            fc=gen_colors()[0],
                            alpha=0.3),)
-        plt.text(max(x) + 0.1, min(y) - 0.1, 
+        plt.text(max(x) + delta_x if reverse[0] else min(x) - delta_x, 
+                 min(y) - delta_y if reverse[1] else max(y) + delta_y, 
                  "Efficient", size=15, rotation=30.,
                  ha="center", va="center",
                  bbox=dict(boxstyle="round",
                            ec=gen_colors()[1],
                            fc=gen_colors()[1],
                            alpha=0.3),)
-        plt.text(min(x) - 0.1, max(y) + 0.1, 
+        plt.text(min(x) - delta_x if reverse[0] else max(x) + delta_x, 
+                 max(y) + delta_y if reverse[1] else min(y) - delta_y, 
                  "Performant", size=15, rotation=-130.,
                  ha="center", va="center",
                  bbox=dict(boxstyle="round",
                            ec=gen_colors()[2],
                            fc=gen_colors()[2],
                            alpha=0.3),)
-        plt.text(min(x) - 0.1, min(y) - 0.1, 
+        plt.text(min(x) - delta_x if reverse[0] else max(x) + delta_x, 
+                 min(y) - delta_y if reverse[1] else max(y) + delta_y, 
                  "Performant & Efficient", size=15, rotation=-30.,
                  ha="center", va="center",
                  bbox=dict(boxstyle="round",

diff --git a/verticapy/learn/model_selection.py b/verticapy/learn/model_selection.py
@@ -207,6 +207,12 @@ def bayesian_search_cv(
             else:
                 result[item] += [None]
     result["score"] = param_gs["avg_score"]
+    if 'max_features' in result:
+        for idx, elem in enumerate(result["max_features"]):
+            if elem == "auto":
+                result["max_features"][idx] = int(np.floor(np.sqrt(len(X))) + 1)
+            elif elem == "max":
+                result["max_features"][idx] = int(len(X))
     result = tablesample(result).to_sql()
     if isinstance(input_relation, str):
         schema, relation = schema_relation(input_relation)
@@ -1016,11 +1022,11 @@ def gen_params_grid(estimator,
                 params_grid["sample"] = [0.7,]
                 params_grid["n_estimators"] = [20,]
         elif optimized_grid == -666:
-            result = {"max_features": {"type": int, "range": [1, len(all_params),], "nbins": nbins,},
+            result = {"max_features": {"type": int, "range": [1, max_nfeatures,], "nbins": nbins,},
                       "max_leaf_nodes": {"type": int, "range": [32, 1e9,], "nbins": nbins,},
                       "max_depth": {"type": int, "range": [2, 30,], "nbins": nbins,},
                       "min_samples_leaf": {"type": int, "range": [1, 15,], "nbins": nbins,},
-                      "min_samples_leaf": {"type": float, "range": [0.0, 0.1,], "nbins": nbins,},
+                      "min_info_gain": {"type": float, "range": [0.0, 0.1,], "nbins": nbins,},
                       "nbins": {"type": int, "range": [10, 1000,], "nbins": nbins,},}
             if isinstance(RandomForestRegressor, RandomForestClassifier,):
                 result["sample"] = {"type": float, "range": [0.1, 1.0,], "nbins": nbins,}

diff --git a/verticapy/learn/neighbors.py b/verticapy/learn/neighbors.py
@@ -485,7 +485,7 @@ def score(
                     pos_label=None,
                 )
             else:
-                return accuracy_score(y_true, y_score, input_relation, self.cursor)
+                return accuracy_score(y_true, y_score, input_relation, self.cursor, pos_label=pos_label,)
         elif method == "auc":
             return auc(y_true, y_proba, input_relation, self.cursor)
         elif method == "prc_auc":

diff --git a/verticapy/learn/pipeline.py b/verticapy/learn/pipeline.py
@@ -176,6 +176,7 @@ def fit(
             self.test_relation = self.steps[-1][1].test_relation
         except:
             pass
+        return self
 
     # ---#
     def get_params(self):
@@ -409,6 +410,53 @@ def set_params(self, parameters: dict = {}):
                 if param.lower() == step[0].lower():
                     step[1].set_params(parameters[param])
 
+    # ---#
+    def to_python(self, 
+                  name: str = "predict",  # name given to the generated outer function
+                  return_proba: bool = False,  # forwarded unchanged to every step's to_python
+                  return_distance_clusters: bool = False,  # forwarded unchanged to every step's to_python
+                  return_str: bool = False,):  # True -> return the source text instead of a callable
+        """
+    ---------------------------------------------------------------------------
+    Returns the Python code needed to deploy the pipeline without using built-in
+    Vertica functions.
+
+    Parameters
+    ----------
+    name: str, optional
+        Function Name.
+    return_proba: bool, optional
+        If set to True and the model is a classifier, the function will return 
+        the model probabilities.
+    return_distance_clusters: bool, optional
+        If set to True and the model type is KMeans or NearestCentroids, the function 
+        will return the model clusters distances.
+    return_str: bool, optional
+        If set to True, the function str will be returned.
+
+
+    Returns
+    -------
+    str / func
+        Python function
+        """
+        if not(return_str):  # callable requested: get the source via the return_str=True path, then execute it
+            func = self.to_python(name=name, return_proba=return_proba, return_distance_clusters=return_distance_clusters, return_str=True,)
+            _locals = locals()
+            exec(func, globals(), _locals)  # the executed source binds the function `name` inside _locals
+            return _locals[name]
+        str_representation = "def {}(X):\n".format(name)  # header of the outer function that wraps all steps
+        final_function = "X"  # innermost argument of the chained call built in the loop below
+        for idx, step in enumerate(self.steps):  # step[0] is used as the function name, step[1] provides to_python
+            str_representation += "\t" + step[1].to_python(name=step[0],  # each step's source is emitted under the step's own name
+                                                           return_proba=return_proba,
+                                                           return_distance_clusters=return_distance_clusters,
+                                                           return_str=True).replace("\n", "\n\t") + "\n"  # indent one tab so the step nests inside the outer def
+            final_function = step[0]+"({})".format(final_function)  # wrap the previous expression: step_k(step_k-1(...(X)))
+        str_representation += "\treturn {}".format(final_function)  # outer function returns the fully chained call
+        return str_representation
+
+
     # ---#
     def to_sklearn(self):
         """

diff --git a/verticapy/learn/vmodel.py b/verticapy/learn/vmodel.py
@@ -1836,7 +1836,7 @@ def shapExplainer(self):
                 )
         else:
             raise FunctionError(
-                "The method 'to_shapExplainer' is not available for model type '{}'.".format(
+                "The method 'shapExplainer' is not available for model type '{}'.".format(
                     self.type
                 )
             )
@@ -2500,7 +2500,6 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
             func += "\tresult = np.column_stack(L)\n"
             func += "\treturn result\n"
             return func
-            """
         elif self.type in ("NaiveBayes",):
             vdf = vdf_from_relation(self.input_relation, cursor=self.cursor)
             var_info = {}
@@ -2517,7 +2516,7 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
                     for c in self.classes_:
                         multinomial = self.get_attr("multinomial.{}".format(c))
                         var_info[elem][c] = multinomial["probability"][multinomial_incr]
-                        multinomial_incr += 1
+                    multinomial_incr += 1
                 elif vdf[elem].isnum():
                     var_info[elem]["type"] = "gaussian"
                     for c in self.classes_:
@@ -2532,52 +2531,78 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
                         if item.lower() == my_cat.lower():
                             my_cat = item
                             break
-                    var_info[elem]["proba"] = self.get_attr(my_cat).values
-            proba = {}
-            prior = self.get_attr("prior")
-            for idx, elem in enumerate(prior["class"]):
-                proba[elem] = prior["probability"][idx]
-            L = []
-            X = 40.0
-            for c in self.classes_:
-                result = proba[c]
+                    val = self.get_attr(my_cat).values
+                    for item in val:
+                        if item != "category":
+                            if item not in var_info[elem]:
+                                var_info[elem][item] = {}
+                            for i, p in enumerate(val[item]):
+                                var_info[elem][item][val["category"][i]] = p
+            var_info_simplified = []
+            for i in range(len(var_info)):
                 for elem in var_info:
-                    if var_info[elem]["type"] == "gaussian":
-                        all_proba = {}
-                        for k in self.classes_:
-                            all_proba[k] = 1 / np.sqrt(var_info[elem][k]["sigma_sq"]) * np.exp(- (X - var_info[elem][k]["mu"]) ** 2 / (2 * var_info[elem][k]["sigma_sq"]))
-                        result *= all_proba[c] / np.sum(np.array([all_proba[k] for k in self.classes_]) * np.array([proba[k] for k in self.classes_]))
-                    elif var_info[elem]["type"] == "bernoulli":
-                        sql += " * ({} - {}::int) / ({} - {}::int)".format(1 - var_info[elem][0], X[var_info[elem]["rank"]], 1 - var_info[elem][1], X[var_info[elem]["rank"]],)
-                    elif var_info[elem]["type"] == "multinomial":
-                        sql += " * POWER({}, {}) / POWER({}, {})".format(var_info[elem][0], X[var_info[elem]["rank"]], var_info[elem][1], X[var_info[elem]["rank"]],)
-                    elif var_info[elem]["type"] == "categorical":
-                        proba = var_info[elem]["proba"]
-                        list_tmp = []
-                        for idx, cat in enumerate(proba["category"]):
-                            list_tmp += ["{} = '{}' THEN {}".format(X[var_info[elem]["rank"]], cat, proba["0"][idx] / proba["1"][idx])]
-                        sql += " * (CASE WHEN " + " WHEN ".join(list_tmp) + " END)"
-                L += [result]
-            return L
+                    if var_info[elem]["rank"] == i:
+                        var_info_simplified += [var_info[elem]]
+                        break
+            for elem in var_info_simplified:
+                del elem["rank"]
+            prior = self.get_attr("prior").values["probability"]
+            func += "var_info_simplified = {}\n".format(var_info_simplified)
+            func += "\tprior = np.array({})\n".format(prior)
+            func += "\tclasses = {}\n".format(self.classes_)
+            func += "\tn, m = {}, {}\n".format(len(self.classes_), len(self.X))
+            func += "\tdef naive_bayes_score_row(X):\n"
+            func += "\t\tresult = []\n"
+            func += "\t\tfor c in classes:\n"
+            func += "\t\t\tsub_result = []\n"
+            func += "\t\t\tfor idx, elem in enumerate(X):\n"
+            func += "\t\t\t\tprob = var_info_simplified[idx]\n"
+            func += "\t\t\t\tif prob['type'] == 'multinomial':\n"
+            func += "\t\t\t\t\tprob = prob[c] ** float(X[idx])\n"
+            func += "\t\t\t\telif prob['type'] == 'bernoulli':\n"
+            func += "\t\t\t\t\tprob = prob[c] if X[idx] else 1 - prob[c]\n"
+            func += "\t\t\t\telif prob['type'] == 'categorical':\n"
+            func += "\t\t\t\t\tprob = prob[str(c)][X[idx]]\n"
+            func += "\t\t\t\telse:\n"
+            func += "\t\t\t\t\tprob = 1 / np.sqrt(2 * np.pi * prob[c]['sigma_sq']) * np.exp(- (float(X[idx]) - prob[c]['mu']) ** 2 / (2 * prob[c]['sigma_sq']))\n"
+            func += "\t\t\t\tsub_result += [prob]\n"
+            func += "\t\t\tresult += [sub_result]\n"
+            func += "\t\tresult = np.array(result).prod(axis=1) * prior\n"
+            if return_proba:
+                func += "\t\treturn result / result.sum()\n"
+            else:
+                func += "\t\treturn classes[np.argmax(result)]\n"
+            func += "\treturn np.apply_along_axis(naive_bayes_score_row, 1, X)\n"
+            return func
         elif self.type in ("OneHotEncoder",):
+            predictors = self.X
             details = self.param_.values
-            n = len(details["category_name"])
-            sql = []
-            cat_idx, current_cat = 0, details["category_name"][0]
-            for i in range(n):
-                if cat_idx != 0 or not(self.parameters["drop_first"]):
-                    end_name = details["category_level_index"][i] if self.parameters["column_naming"] != 'values' else details["category_level"][i]
-                    sql += ["(CASE WHEN \"{}\" = '{}' THEN 1 ELSE 0 END) AS \"{}_{}\"".format(details["category_name"][i], details["category_level"][i], details["category_name"][i], end_name)]
-                if current_cat != details["category_name"][i]:
-                    cat_idx = 0
-                    current_cat = details["category_name"][i]
+            n, m = len(predictors), len(details["category_name"])
+            positions = {}
+            for i in range(m):
+                val = str_column(details["category_name"][i])
+                if val not in positions:
+                    positions[val] = [i]
                 else:
-                    cat_idx += 1
-            sql = ", ".join(sql)
-            for idx, elem in enumerate(X):
-                sql = sql.replace(self.X[idx], str_column(X[idx]))
-            return sql
-            """
+                    positions[val] += [i]
+            category_level = []
+            for p in predictors:
+                pos = positions[p]
+                category_level += [details["category_level"][pos[0]:pos[-1] + 1]]
+            if self.parameters["drop_first"]:
+                category_level = [elem[1:] for elem in category_level]
+            func += "category_level = {}\n\t".format(category_level)
+            func += "def ooe_row(X):\n\t"
+            func += "\tresult = []\n\t"
+            func += "\tfor idx, elem in enumerate(X):\n\t\t"
+            func += "\tfor item in category_level[idx]:\n\t\t\t"
+            func += "\tif str(elem) == str(item):\n\t\t\t\t"
+            func += "\tresult += [1]\n\t\t\t"
+            func += "\telse:\n\t\t\t\t"
+            func += "\tresult += [0]\n\t"
+            func += "\treturn result\n"
+            func += "\treturn np.apply_along_axis(ooe_row, 1, X)\n"
+            return func
         elif self.type in ("RandomForestClassifier", "RandomForestRegressor", "XGBoostRegressor", "XGBoostClassifier",):
             def map_idx(x):
                 for idx, elem in enumerate(self.X):
@@ -2844,15 +2869,14 @@ def predict_tree(tree_dict, node_id: int, clusters_distance: list):
             sql = []
             cat_idx, current_cat = 0, details["category_name"][0]
             for i in range(n):
+                if current_cat != details["category_name"][i]:
+                    cat_idx = 0
+                    current_cat = details["category_name"][i]
                 if cat_idx != 0 or not(self.parameters["drop_first"]):
                     end_name = details["category_level_index"][i] if self.parameters["column_naming"] != 'values' else details["category_level"][i]
                     end_name = 'NULL' if end_name == None else end_name
                     sql += ["(CASE WHEN \"{}\" = {} THEN 1 ELSE 0 END) AS \"{}_{}\"".format(details["category_name"][i], "'" + str(details["category_level"][i]) + "'" if details["category_level"][i] != None else 'NULL', details["category_name"][i], end_name)]
-                if current_cat != details["category_name"][i]:
-                    cat_idx = 0
-                    current_cat = details["category_name"][i]
-                else:
-                    cat_idx += 1
+                cat_idx += 1
             sql = ", ".join(sql)
             for idx, elem in enumerate(X):
                 sql = sql.replace(self.X[idx], str_column(X[idx]))
@@ -3492,7 +3516,7 @@ def score(self, method: str = "accuracy", cutoff: float = 0.5):
         check_types([("cutoff", cutoff, [int, float],), ("method", method, [str],)])
         if method in ("accuracy", "acc"):
             return accuracy_score(
-                self.y, self.deploySQL(cutoff), self.test_relation, self.cursor
+                self.y, self.deploySQL(cutoff), self.test_relation, self.cursor, pos_label=1,
             )
         elif method == "aic":
             return aic_bic(self.y, self.deploySQL(), self.test_relation, self.cursor, len(self.X))[0]
@@ -3501,15 +3525,15 @@ def score(self, method: str = "accuracy", cutoff: float = 0.5):
         elif method == "prc_auc":
             return prc_auc(self.y, self.deploySQL(), self.test_relation, self.cursor)
         elif method == "auc":
-            return roc_curve(self.y, self.deploySQL(), self.test_relation, self.cursor, auc_roc=True,)
+            return roc_curve(self.y, self.deploySQL(), self.test_relation, self.cursor, auc_roc=True, nbins=10000,)
         elif method in ("best_cutoff", "best_threshold"):
             return roc_curve(
                 self.y,
                 self.deploySQL(),
                 self.test_relation,
                 self.cursor,
                 best_threshold=True,
-                nbins=1000,
+                nbins=10000,
             )
         elif method in ("recall", "tpr"):
             return recall_score(

diff --git a/verticapy/tests/vModel/test_decision_tree_classifier.py b/verticapy/tests/vModel/test_decision_tree_classifier.py
@@ -223,6 +223,21 @@ def test_to_sklearn(self, model):
 
         # 'predict_proba'
 
+    def test_to_python(self, model, titanic_vd):  # checks that the to_python predictor returns the same class as the in-database prediction
+        model_test = DecisionTreeClassifier("rfc_python_test", cursor=model.cursor)  # new model object that reuses the fixture's cursor
+        model_test.drop()  # presumably removes any model already stored under this name - confirm against drop()
+        model_test.fit(titanic_vd, ["age", "fare", "sex"], "embarked")
+        model_test.cursor.execute(  # reference prediction computed by Vertica for one hand-written row
+            "SELECT PREDICT_RF_CLASSIFIER(30.0, 45.0, 'male' USING PARAMETERS model_name = 'rfc_python_test', match_by_pos=True)"
+        )
+        prediction = model_test.cursor.fetchone()[0]
+        assert prediction == model_test.to_python(return_str=False)([[30.0, 45.0, 'male']])[0]  # same row passed as a one-row batch; compare the first result
+        model_test.cursor.execute(  # second reference row with a different fare and sex
+            "SELECT PREDICT_RF_CLASSIFIER(30.0, 145.0, 'female' USING PARAMETERS model_name = 'rfc_python_test', match_by_pos=True)"
+        )
+        prediction = model_test.cursor.fetchone()[0]
+        assert prediction == model_test.to_python(return_str=False)([[30.0, 145.0, 'female']])[0]  # same comparison for the second row
+
     def test_to_sql(self, model, titanic_vd):
         model_test = DecisionTreeClassifier("rfc_sql_test", cursor=model.cursor)
         model_test.drop()