Small corrections (#72)
* Small corrections

Small corrections:
 - set_parameters
 - some tests
 - udf

* Update test_dummy_tree_classifier.py

* Skip shap-dependent tests due to a problem with the shap installation

* Some corrections + new tests

 - New tests for PCA, SVD, Normalizer & OneHotEncoder
 - Corrected SVD to_sklearn
 - Corrected regression_report

* Update test_linear_regression.py

* Small changes

 - Added deployInverseSQL
 - Model cleaning

* Update test_normalizer.py

Added model2.drop() at line 177 and model3.drop() at line 197.

Co-authored-by: Arash Fard <[email protected]>
oualib and afard authored Jan 20, 2021
1 parent b0e800d commit 02b7d64
Showing 19 changed files with 1,032 additions and 95 deletions.
5 changes: 3 additions & 2 deletions verticapy/geo.py
@@ -100,8 +100,9 @@ def create_index(
Returns
-------
vDataFrame
object result of the join.
tablesample
An object containing the result. For more information, see
utilities.tablesample.
"""
check_types(
[
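The geo.py hunk above only corrects the documented return type of create_index: the function returns a tablesample (see utilities.tablesample), not a vDataFrame. A minimal, hypothetical caller-side sketch of what that means in practice; the connection details, table, and parameter names (gid, g, index) are illustrative assumptions, not taken from this commit:

# Illustrative sketch only: connection details, table, and parameter names are placeholders.
import vertica_python
from verticapy import vDataFrame
from verticapy.geo import create_index

cur = vertica_python.connect(host="localhost", port=5433,
                             user="dbadmin", database="demo").cursor()
cities = vDataFrame("public.cities", cursor=cur)  # table with an id and a GEOMETRY column
result = create_index(cities, gid="gid", g="geom", index="cities_index")

# Per the updated docstring, result is a tablesample describing the index,
# not a vDataFrame, so it is inspected rather than chained into further queries.
print(result)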
9 changes: 6 additions & 3 deletions verticapy/learn/linear_model.py
@@ -163,7 +163,8 @@ def __init__(
}
)
for elem in ["l1_ratio"]:
del self.parameters[elem]
if elem in self.parameters:
del self.parameters[elem]
cursor = check_cursor(cursor)[0]
self.cursor = cursor
version(cursor=cursor, condition=[8, 0, 0])
@@ -214,7 +215,8 @@ def __init__(
}
)
for elem in ["l1_ratio", "C"]:
del self.parameters[elem]
if elem in self.parameters:
del self.parameters[elem]
cursor = check_cursor(cursor)[0]
self.cursor = cursor
version(cursor=cursor, condition=[8, 0, 0])
@@ -342,7 +344,8 @@ def __init__(
}
)
for elem in ["l1_ratio"]:
del self.parameters[elem]
if elem in self.parameters:
del self.parameters[elem]
cursor = check_cursor(cursor)[0]
self.cursor = cursor
version(cursor=cursor, condition=[8, 0, 0])
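All three linear_model.py hunks apply the same fix: a parameter key is deleted only when it is actually present, so model classes whose parameter dictionaries never contained l1_ratio or C no longer raise KeyError during __init__. A standalone sketch of the idiom, with illustrative dictionary contents:

# Illustrative only: guarded deletion of optional keys, mirroring the hunks above.
parameters = {"tol": 1e-4, "max_iter": 100, "solver": "newton"}  # no "l1_ratio" or "C"

for elem in ["l1_ratio", "C"]:
    if elem in parameters:  # without this check, a missing key raises KeyError
        del parameters[elem]

print(parameters)  # {'tol': 0.0001, 'max_iter': 100, 'solver': 'newton'}

An equivalent shorthand is parameters.pop(elem, None), which also tolerates missing keys.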
7 changes: 5 additions & 2 deletions verticapy/learn/pipeline.py
@@ -60,7 +60,7 @@ class Pipeline:
"""
---------------------------------------------------------------------------
Creates a Pipeline object. Sequentially apply a list of transforms and a
final estimator.
final estimator. The intermediate steps must implement a transform method.
Parameters
----------
@@ -216,7 +216,10 @@ def predict(
current_vdf = vdf
for idx, step in enumerate(self.steps):
if idx == len(self.steps) - 1:
current_vdf = step[1].predict(current_vdf, X_new, name = name)
try:
current_vdf = step[1].predict(current_vdf, X_new, name = name, inplace = False)
except:
current_vdf = step[1].predict(current_vdf, X_new, name = name)
else:
current_vdf = step[1].transform(current_vdf, X_new)
X_new = step[1].get_names(X = X)
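The predict hunk above makes the final estimator call defensive: the newer predict signature is tried first with inplace = False, and the older signature is used as a fallback when that keyword is rejected. A hypothetical usage sketch of such a pipeline; the model names, columns, connection details, and the fit signature are assumptions for illustration, not taken from this commit:

# Sketch only: every intermediate step must expose transform(); only the last step predicts.
import vertica_python
from verticapy import vDataFrame
from verticapy.learn.pipeline import Pipeline
from verticapy.learn.preprocessing import Normalizer
from verticapy.learn.linear_model import LinearRegression

cur = vertica_python.connect(host="localhost", port=5433,
                             user="dbadmin", database="demo").cursor()
pipe = Pipeline([
    ("scaler", Normalizer("pipe_norm", cursor=cur)),    # transform step
    ("lr", LinearRegression("pipe_lr", cursor=cur)),    # final estimator
])
pipe.fit("public.winequality", ["alcohol", "residual_sugar"], "quality")

# predict applies each transform in order, then calls the estimator's predict;
# per the hunk above, inplace = False is tried first, with the older call as a fallback.
result = pipe.predict(vDataFrame("public.winequality", cursor=cur), name="quality_pred")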
337 changes: 256 additions & 81 deletions verticapy/learn/vmodel.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion verticapy/tests/vModel/test_decision_tree_regressor.py
@@ -142,7 +142,7 @@ def test_get_params(self, model):
def test_get_plot(self):
pass

@pytest.mark.xfail(reason = "vmodel.py, line 1725, in to_sklearn: local variable 'model' referenced before assignment")
@pytest.mark.xfail(reason = "pb with sklearn trees")
def test_to_sklearn(self, base):
base.cursor.execute("DROP MODEL IF EXISTS tr_model_sk_test")

2 changes: 1 addition & 1 deletion verticapy/tests/vModel/test_dummy_tree_classifier.py
@@ -275,7 +275,7 @@ def test_set_cursor(self):

def test_set_params(self, model):
model.set_params({"nbins": 100})

# Nothing will change as Dummy Trees have no parameters
assert model.get_params()["nbins"] == 100

def test_model_from_vDF(self, base, dtc_data_vd):
6 changes: 3 additions & 3 deletions verticapy/tests/vModel/test_dummy_tree_regressor.py
@@ -140,7 +140,7 @@ def test_get_params(self, model):
def test_get_plot(self):
pass

@pytest.mark.xfail(reason = "vmodel.py, line 1725, in to_sklearn: local variable 'model' referenced before assignment")
@pytest.mark.xfail(reason = "pb with sklearn trees")
def test_to_sklearn(self, base):
base.cursor.execute("DROP MODEL IF EXISTS tr_model_sk_test")

@@ -253,9 +253,9 @@ def test_set_cursor(self, base):
model_test.drop()

def test_set_params(self, model):
# Nothing will change as Dummy Trees have no parameters
model.set_params({"max_features": 100})

assert model.get_params()["max_features"] == 100
assert model.get_params()["max_features"] == "max"

def test_model_from_vDF(self, base, tr_data_vd):
base.cursor.execute("DROP MODEL IF EXISTS tr_from_vDF")
1 change: 1 addition & 0 deletions verticapy/tests/vModel/test_elastic_net.py
@@ -148,6 +148,7 @@ def test_to_sklearn(self, model):
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.predict([[3.0, 11.0, 93.0]])[0][0])

@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(5.81837771)
1 change: 1 addition & 0 deletions verticapy/tests/vModel/test_lasso.py
@@ -147,6 +147,7 @@ def test_to_sklearn(self, model):
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.predict([[3.0, 11.0, 93.0]])[0][0])

@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(5.81837771)
1 change: 1 addition & 0 deletions verticapy/tests/vModel/test_linear_regression.py
@@ -145,6 +145,7 @@ def test_to_sklearn(self, model):
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.predict([[3.0, 11.0, 93.0]])[0][0])

@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(5.81837771)
1 change: 1 addition & 0 deletions verticapy/tests/vModel/test_linear_svc.py
@@ -133,6 +133,7 @@ def test_to_sklearn(self, model):

# 'LinearSVC' object (md) has no attribute 'predict_proba'

@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(-0.22667938806360247)
1 change: 1 addition & 0 deletions verticapy/tests/vModel/test_linear_svr.py
@@ -138,6 +138,7 @@ def test_to_sklearn(self, model):
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.predict([[3.0, 11.0, 93.0]])[0][0])

@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(5.819113657580594)
2 changes: 1 addition & 1 deletion verticapy/tests/vModel/test_logistic_regression.py
@@ -141,7 +141,7 @@ def test_to_sklearn(self, model):
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.predict_proba([[11.0, 1993.0]])[0][1])

@pytest.mark.skip(reason="shap doesn't want to work on python3.6")
@pytest.mark.skip(reason="problem with shap installation")
def test_shapExplainer(self, model):
explainer = model.shapExplainer()
assert explainer.expected_value[0] == pytest.approx(-0.4617437138350809)
241 changes: 241 additions & 0 deletions verticapy/tests/vModel/test_normalizer.py
@@ -0,0 +1,241 @@
# (c) Copyright [2018-2020] Micro Focus or one of its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest, warnings, sys
from verticapy.learn.preprocessing import Normalizer
from verticapy import drop_table

from verticapy import set_option

set_option("print_info", False)


@pytest.fixture(scope="module")
def winequality_vd(base):
from verticapy.learn.datasets import load_winequality

winequality = load_winequality(cursor=base.cursor)
yield winequality
with warnings.catch_warnings(record=True) as w:
drop_table(name="public.winequality", cursor=base.cursor)


@pytest.fixture(scope="module")
def model(base, winequality_vd):
base.cursor.execute("DROP MODEL IF EXISTS norm_model_test")
model_class = Normalizer("norm_model_test", cursor=base.cursor)
model_class.fit(
"public.winequality", ["citric_acid", "residual_sugar", "alcohol"]
)
yield model_class
model_class.drop()


class TestNormalizer:

def test_deploySQL(self, model):
expected_sql = 'APPLY_NORMALIZE("citric_acid", "residual_sugar", "alcohol" USING PARAMETERS model_name = \'norm_model_test\', match_by_pos = \'true\')'
result_sql = model.deploySQL()

assert result_sql == expected_sql

def test_deployInverseSQL(self, model):
expected_sql = "REVERSE_NORMALIZE(\"citric_acid\", \"residual_sugar\", \"alcohol\" USING PARAMETERS model_name = 'norm_model_test', match_by_pos = 'true')"
result_sql = model.deployInverseSQL()

assert result_sql == expected_sql

def test_drop(self, base):
base.cursor.execute("DROP MODEL IF EXISTS norm_model_test_drop")
model_test = Normalizer("norm_model_test_drop", cursor=base.cursor)
model_test.fit("public.winequality", ["alcohol", "quality"])

base.cursor.execute(
"SELECT model_name FROM models WHERE model_name = 'norm_model_test_drop'"
)
assert base.cursor.fetchone()[0] == "norm_model_test_drop"

model_test.drop()
base.cursor.execute(
"SELECT model_name FROM models WHERE model_name = 'norm_model_test_drop'"
)
assert base.cursor.fetchone() is None

def test_get_attr(self, model):
m_att = model.get_attr()

assert m_att["attr_name"] == [
"details",
]
assert m_att["attr_fields"] == [
"column_name, avg, std_dev",
]
assert m_att["#_of_rows"] == [3]

m_att_details = model.get_attr(attr_name="details")

assert m_att_details["column_name"] == [
"citric_acid",
"residual_sugar",
"alcohol",
]
assert m_att_details["avg"][0] == pytest.approx(0.318633215330152, abs=1e-6)
assert m_att_details["avg"][1] == pytest.approx(5.44323533938741, abs=1e-6)
assert m_att_details["avg"][2] == pytest.approx(10.4918008311528, abs=1e-6)
assert m_att_details["std_dev"][0] == pytest.approx(0.145317864897592, abs=1e-6)
assert m_att_details["std_dev"][1] == pytest.approx(4.75780374314742, abs=1e-6)
assert m_att_details["std_dev"][2] == pytest.approx(1.192711748871)

def test_get_params(self, model):
assert model.get_params() == {'method': 'zscore'}

def test_to_sklearn(self, model):
# Zscore
md = model.to_sklearn()
model.cursor.execute(
"SELECT APPLY_NORMALIZE(citric_acid, residual_sugar, alcohol USING PARAMETERS model_name = '{}', match_by_pos=True) FROM (SELECT 3.0 AS citric_acid, 11.0 AS residual_sugar, 93. AS alcohol) x".format(
model.name
)
)
prediction = model.cursor.fetchone()[0]
assert prediction == pytest.approx(md.transform([[3.0, 11.0, 93.0]])[0][0])
# Minmax
model2 = Normalizer("norm_model_test2", cursor=model.cursor, method = "minmax")
model2.drop()
model2.fit(
"public.winequality", ["citric_acid", "residual_sugar", "alcohol"]
)
md = model2.to_sklearn()
model2.cursor.execute(
"SELECT APPLY_NORMALIZE(citric_acid, residual_sugar, alcohol USING PARAMETERS model_name = '{}', match_by_pos=True) FROM (SELECT 3.0 AS citric_acid, 11.0 AS residual_sugar, 93. AS alcohol) x".format(
model2.name
)
)
prediction = model2.cursor.fetchone()[0]
model2.drop()
assert prediction == pytest.approx(md.transform([[3.0, 11.0, 93.0]])[0][0])
# Robust Zscore
model3 = Normalizer("norm_model_test2", cursor=model.cursor, method = "robust_zscore")
model3.drop()
model3.fit(
"public.winequality", ["citric_acid", "residual_sugar", "alcohol"]
)
md = model3.to_sklearn()
model3.cursor.execute(
"SELECT APPLY_NORMALIZE(citric_acid, residual_sugar, alcohol USING PARAMETERS model_name = '{}', match_by_pos=True) FROM (SELECT 3.0 AS citric_acid, 11.0 AS residual_sugar, 93. AS alcohol) x".format(
model3.name
)
)
prediction = model3.cursor.fetchone()[0]
model3.drop()
assert prediction == pytest.approx(md.transform([[3.0, 11.0, 93.0]])[0][0])

def test_get_transform(self, winequality_vd, model):
# Zscore
winequality_trans = model.transform(
winequality_vd,
X=["citric_acid", "residual_sugar", "alcohol"]
)
assert winequality_trans["citric_acid"].mean() == pytest.approx(
0.0, abs=1e-6
)
assert winequality_trans["residual_sugar"].mean() == pytest.approx(
0.0, abs=1e-6
)
assert winequality_trans["alcohol"].mean() == pytest.approx(
0.0, abs=1e-6
)
# Minmax
model2 = Normalizer("norm_model_test2", cursor=model.cursor, method = "minmax")
model2.drop()
model2.fit(
"public.winequality", ["citric_acid", "residual_sugar", "alcohol"]
)
winequality_trans = model2.transform(
winequality_vd,
X=["citric_acid", "residual_sugar", "alcohol"]
)
assert winequality_trans["citric_acid"].min() == pytest.approx(
0.0, abs=1e-6
)
assert winequality_trans["residual_sugar"].max() == pytest.approx(
1.0, abs=1e-6
)
assert winequality_trans["alcohol"].min() == pytest.approx(
0.0, abs=1e-6
)
model2.drop()
# Robust Zscore
model3 = Normalizer("norm_model_test2", cursor=model.cursor, method = "robust_zscore")
model3.drop()
model3.fit(
"public.winequality", ["citric_acid", "residual_sugar", "alcohol"]
)
winequality_trans = model3.transform(
winequality_vd,
X=["citric_acid", "residual_sugar", "alcohol"]
)
assert winequality_trans["citric_acid"].median() == pytest.approx(
0.0, abs=1e-6
)
assert winequality_trans["residual_sugar"].median() == pytest.approx(
0.0, abs=1e-6
)
assert winequality_trans["alcohol"].median() == pytest.approx(
0.0, abs=1e-6
)
model3.drop()

def test_get_inverse_transform(self, winequality_vd, model):
winequality_trans = model.inverse_transform(
winequality_vd,
X=["citric_acid", "residual_sugar", "alcohol"]
)
assert winequality_trans["citric_acid"].mean() == pytest.approx(
0.364936313867385, abs=1e-6
)
assert winequality_trans["residual_sugar"].mean() == pytest.approx(
31.3410808119571, abs=1e-6
)
assert winequality_trans["alcohol"].mean() == pytest.approx(
23.0054949492833, abs=1e-6
)

def test_set_cursor(self, base):
model_test = Normalizer("norm_cursor_test", cursor=base.cursor)
# TODO: creat a new cursor
model_test.set_cursor(base.cursor)
model_test.drop()
model_test.fit("public.winequality", ["alcohol"])

base.cursor.execute(
"SELECT model_name FROM models WHERE model_name = 'norm_cursor_test'"
)
assert base.cursor.fetchone()[0] == "norm_cursor_test"
model_test.drop()

def test_set_params(self, model):
model.set_params({"method": "robust_zscore"})
assert model.get_params()["method"] == "robust_zscore"
model.set_params({"method": "zscore"})
assert model.get_params()["method"] == "zscore"

def test_model_from_vDF(self, base, winequality_vd):
base.cursor.execute("DROP MODEL IF EXISTS norm_vDF")
model_test = Normalizer("norm_vDF", cursor=base.cursor)
model_test.fit(winequality_vd, ["alcohol", "quality"])
base.cursor.execute(
"SELECT model_name FROM models WHERE model_name = 'norm_vDF'"
)
assert base.cursor.fetchone()[0] == "norm_vDF"
model_test.drop()