Skip to content

Commit

Permalink
Multiple corrections and code completion (#98)
Browse files Browse the repository at this point in the history
* Multiple corrections and code completion

 - Display Bug for autoML - corrected
 - Adding to_python for Pipelines
 - to_python is now available for NaiveBayes and OneHotEncoder
 - correction of to_sql for OneHotEncoder
 - correction of read_csv for headers containing spaces
 - correction of outlier_plot

* to_python correction

 - correcting to_python
 - adding tests to_python

* auc value correction

* Bug + simplification

Test simplification
Correction bug model selection

* Update test_onehotencoder.py

* Small corrections
  • Loading branch information
oualib authored May 4, 2021
1 parent e3f3040 commit c4e8b74
Show file tree
Hide file tree
Showing 23 changed files with 318 additions and 71 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

setuptools.setup(
name = 'verticapy',
version = '0.6.0',
version = '0.6.1',
author = "Badr Ouali",
author_email = "[email protected]",
url = "https://github.com/vertica/VerticaPy",
Expand Down
2 changes: 1 addition & 1 deletion verticapy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
# of moving data around for processing, VerticaPy brings the logic to the data.
#
#
__version__ = "0.6.0"
__version__ = "0.6.1"
__author__ = "Badr Ouali"
__author_email__ = "[email protected]"
__description__ = """VerticaPy simplifies data exploration, data cleaning and machine learning in Vertica."""
Expand Down
2 changes: 1 addition & 1 deletion verticapy/learn/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def accuracy_score(
y_score: str,
input_relation: (str, vDataFrame),
cursor=None,
pos_label: (int, float, str) = 1,
pos_label: (int, float, str) = None,
):
"""
---------------------------------------------------------------------------
Expand Down
16 changes: 11 additions & 5 deletions verticapy/learn/mlplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ def plot_importance(
if print_legend:
orange = mpatches.Patch(color=color_dict(style_kwds, 1), label="sign -")
blue = mpatches.Patch(color=color_dict(style_kwds, 0), label="sign +")
ax.legend(handles=[orange, blue], loc="center left", bbox_to_anchor=[1, 0.5])
ax.legend(handles=[blue, orange,], loc="center left", bbox_to_anchor=[1, 0.5])
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.set_ylabel("Features")
Expand Down Expand Up @@ -610,28 +610,34 @@ def plot_bubble_ml(x: list, y: list, s: list = None, z: list = [], x_label: str
ax.spines['bottom'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
plt.text(max(x) + 0.1, max(y) + 0.1,
delta_x = (max(x) - min(x)) * 0.1
delta_y = (max(y) - min(y)) * 0.1
plt.text(max(x) + delta_x if reverse[0] else min(x) - delta_x,
max(y) + delta_y if reverse[1] else min(y) - delta_y,
"Modest", size=15, rotation=130.,
ha="center", va="center",
bbox=dict(boxstyle="round",
ec=gen_colors()[0],
fc=gen_colors()[0],
alpha=0.3),)
plt.text(max(x) + 0.1, min(y) - 0.1,
plt.text(max(x) + delta_x if reverse[0] else min(x) - delta_x,
min(y) - delta_y if reverse[1] else max(y) + delta_y,
"Efficient", size=15, rotation=30.,
ha="center", va="center",
bbox=dict(boxstyle="round",
ec=gen_colors()[1],
fc=gen_colors()[1],
alpha=0.3),)
plt.text(min(x) - 0.1, max(y) + 0.1,
plt.text(min(x) - delta_x if reverse[0] else max(x) + delta_x,
max(y) + delta_y if reverse[1] else min(y) - delta_y,
"Performant", size=15, rotation=-130.,
ha="center", va="center",
bbox=dict(boxstyle="round",
ec=gen_colors()[2],
fc=gen_colors()[2],
alpha=0.3),)
plt.text(min(x) - 0.1, min(y) - 0.1,
plt.text(min(x) - delta_x if reverse[0] else max(x) + delta_x,
min(y) - delta_y if reverse[1] else max(y) + delta_y,
"Performant & Efficient", size=15, rotation=-30.,
ha="center", va="center",
bbox=dict(boxstyle="round",
Expand Down
10 changes: 8 additions & 2 deletions verticapy/learn/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ def bayesian_search_cv(
else:
result[item] += [None]
result["score"] = param_gs["avg_score"]
if 'max_features' in result:
for idx, elem in enumerate(result["max_features"]):
if elem == "auto":
result["max_features"][idx] = int(np.floor(np.sqrt(len(X))) + 1)
elif elem == "max":
result["max_features"][idx] = int(len(X))
result = tablesample(result).to_sql()
if isinstance(input_relation, str):
schema, relation = schema_relation(input_relation)
Expand Down Expand Up @@ -1016,11 +1022,11 @@ def gen_params_grid(estimator,
params_grid["sample"] = [0.7,]
params_grid["n_estimators"] = [20,]
elif optimized_grid == -666:
result = {"max_features": {"type": int, "range": [1, len(all_params),], "nbins": nbins,},
result = {"max_features": {"type": int, "range": [1, max_nfeatures,], "nbins": nbins,},
"max_leaf_nodes": {"type": int, "range": [32, 1e9,], "nbins": nbins,},
"max_depth": {"type": int, "range": [2, 30,], "nbins": nbins,},
"min_samples_leaf": {"type": int, "range": [1, 15,], "nbins": nbins,},
"min_samples_leaf": {"type": float, "range": [0.0, 0.1,], "nbins": nbins,},
"min_info_gain": {"type": float, "range": [0.0, 0.1,], "nbins": nbins,},
"nbins": {"type": int, "range": [10, 1000,], "nbins": nbins,},}
if isinstance(RandomForestRegressor, RandomForestClassifier,):
result["sample"] = {"type": float, "range": [0.1, 1.0,], "nbins": nbins,}
Expand Down
2 changes: 1 addition & 1 deletion verticapy/learn/neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def score(
pos_label=None,
)
else:
return accuracy_score(y_true, y_score, input_relation, self.cursor)
return accuracy_score(y_true, y_score, input_relation, self.cursor, pos_label=pos_label,)
elif method == "auc":
return auc(y_true, y_proba, input_relation, self.cursor)
elif method == "prc_auc":
Expand Down
48 changes: 48 additions & 0 deletions verticapy/learn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def fit(
self.test_relation = self.steps[-1][1].test_relation
except:
pass
return self

# ---#
def get_params(self):
Expand Down Expand Up @@ -409,6 +410,53 @@ def set_params(self, parameters: dict = {}):
if param.lower() == step[0].lower():
step[1].set_params(parameters[param])

# ---#
def to_python(self,
              name: str = "predict",
              return_proba: bool = False,
              return_distance_clusters: bool = False,
              return_str: bool = False,):
    """
    ---------------------------------------------------------------------------
    Returns the Python code needed to deploy the pipeline without using built-in
    Vertica functions.

    The generated function defines one nested function per pipeline step (named
    after the step) and returns the composition of these functions applied to
    its input, first step innermost.

    Parameters
    ----------
    name: str, optional
        Function Name.
    return_proba: bool, optional
        If set to True and the model is a classifier, the function will return
        the model probabilities. The flag is forwarded to every step.
    return_distance_clusters: bool, optional
        If set to True and the model type is KMeans or NearestCentroids, the
        function will return the model clusters distances. The flag is
        forwarded to every step.
    return_str: bool, optional
        If set to True, the function str will be returned.

    Returns
    -------
    str / func
        Python function
    """
    pieces = ["def {}(X):\n".format(name)]
    call_chain = "X"
    for step in self.steps:
        step_name, step_model = step[0], step[1]
        step_code = step_model.to_python(
            name=step_name,
            return_proba=return_proba,
            return_distance_clusters=return_distance_clusters,
            return_str=True,
        )
        # Indent the step's code by one tab so that it becomes a function
        # nested inside the generated pipeline function.
        pieces.append("\t" + step_code.replace("\n", "\n\t") + "\n")
        # Wrap the previous call so the steps are applied in pipeline order.
        call_chain = "{}({})".format(step_name, call_chain)
    pieces.append("\treturn {}".format(call_chain))
    str_representation = "".join(pieces)
    if return_str:
        return str_representation
    # NOTE: exec runs code produced by the steps' own to_python methods. It is
    # executed in a dedicated namespace so that the method's local variables
    # are neither exposed to nor modified by the generated code.
    namespace = {}
    exec(str_representation, globals(), namespace)
    return namespace[name]


# ---#
def to_sklearn(self):
"""
Expand Down
130 changes: 77 additions & 53 deletions verticapy/learn/vmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,7 @@ def shapExplainer(self):
)
else:
raise FunctionError(
"The method 'to_shapExplainer' is not available for model type '{}'.".format(
"The method 'shapExplainer' is not available for model type '{}'.".format(
self.type
)
)
Expand Down Expand Up @@ -2500,7 +2500,6 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
func += "\tresult = np.column_stack(L)\n"
func += "\treturn result\n"
return func
"""
elif self.type in ("NaiveBayes",):
vdf = vdf_from_relation(self.input_relation, cursor=self.cursor)
var_info = {}
Expand All @@ -2517,7 +2516,7 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
for c in self.classes_:
multinomial = self.get_attr("multinomial.{}".format(c))
var_info[elem][c] = multinomial["probability"][multinomial_incr]
multinomial_incr += 1
multinomial_incr += 1
elif vdf[elem].isnum():
var_info[elem]["type"] = "gaussian"
for c in self.classes_:
Expand All @@ -2532,52 +2531,78 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di
if item.lower() == my_cat.lower():
my_cat = item
break
var_info[elem]["proba"] = self.get_attr(my_cat).values
proba = {}
prior = self.get_attr("prior")
for idx, elem in enumerate(prior["class"]):
proba[elem] = prior["probability"][idx]
L = []
X = 40.0
for c in self.classes_:
result = proba[c]
val = self.get_attr(my_cat).values
for item in val:
if item != "category":
if item not in var_info[elem]:
var_info[elem][item] = {}
for i, p in enumerate(val[item]):
var_info[elem][item][val["category"][i]] = p
var_info_simplified = []
for i in range(len(var_info)):
for elem in var_info:
if var_info[elem]["type"] == "gaussian":
all_proba = {}
for k in self.classes_:
all_proba[k] = 1 / np.sqrt(var_info[elem][k]["sigma_sq"]) * np.exp(- (X - var_info[elem][k]["mu"]) ** 2 / (2 * var_info[elem][k]["sigma_sq"]))
result *= all_proba[c] / np.sum(np.array([all_proba[k] for k in self.classes_]) * np.array([proba[k] for k in self.classes_]))
elif var_info[elem]["type"] == "bernoulli":
sql += " * ({} - {}::int) / ({} - {}::int)".format(1 - var_info[elem][0], X[var_info[elem]["rank"]], 1 - var_info[elem][1], X[var_info[elem]["rank"]],)
elif var_info[elem]["type"] == "multinomial":
sql += " * POWER({}, {}) / POWER({}, {})".format(var_info[elem][0], X[var_info[elem]["rank"]], var_info[elem][1], X[var_info[elem]["rank"]],)
elif var_info[elem]["type"] == "categorical":
proba = var_info[elem]["proba"]
list_tmp = []
for idx, cat in enumerate(proba["category"]):
list_tmp += ["{} = '{}' THEN {}".format(X[var_info[elem]["rank"]], cat, proba["0"][idx] / proba["1"][idx])]
sql += " * (CASE WHEN " + " WHEN ".join(list_tmp) + " END)"
L += [result]
return L
if var_info[elem]["rank"] == i:
var_info_simplified += [var_info[elem]]
break
for elem in var_info_simplified:
del elem["rank"]
prior = self.get_attr("prior").values["probability"]
func += "var_info_simplified = {}\n".format(var_info_simplified)
func += "\tprior = np.array({})\n".format(prior)
func += "\tclasses = {}\n".format(self.classes_)
func += "\tn, m = {}, {}\n".format(len(self.classes_), len(self.X))
func += "\tdef naive_bayes_score_row(X):\n"
func += "\t\tresult = []\n"
func += "\t\tfor c in classes:\n"
func += "\t\t\tsub_result = []\n"
func += "\t\t\tfor idx, elem in enumerate(X):\n"
func += "\t\t\t\tprob = var_info_simplified[idx]\n"
func += "\t\t\t\tif prob['type'] == 'multinomial':\n"
func += "\t\t\t\t\tprob = prob[c] ** float(X[idx])\n"
func += "\t\t\t\telif prob['type'] == 'bernoulli':\n"
func += "\t\t\t\t\tprob = prob[c] if X[idx] else 1 - prob[c]\n"
func += "\t\t\t\telif prob['type'] == 'categorical':\n"
func += "\t\t\t\t\tprob = prob[str(c)][X[idx]]\n"
func += "\t\t\t\telse:\n"
func += "\t\t\t\t\tprob = 1 / np.sqrt(2 * np.pi * prob[c]['sigma_sq']) * np.exp(- (float(X[idx]) - prob[c]['mu']) ** 2 / (2 * prob[c]['sigma_sq']))\n"
func += "\t\t\t\tsub_result += [prob]\n"
func += "\t\t\tresult += [sub_result]\n"
func += "\t\tresult = np.array(result).prod(axis=1) * prior\n"
if return_proba:
func += "\t\treturn result / result.sum()\n"
else:
func += "\t\treturn classes[np.argmax(result)]\n"
func += "\treturn np.apply_along_axis(naive_bayes_score_row, 1, X)\n"
return func
elif self.type in ("OneHotEncoder",):
predictors = self.X
details = self.param_.values
n = len(details["category_name"])
sql = []
cat_idx, current_cat = 0, details["category_name"][0]
for i in range(n):
if cat_idx != 0 or not(self.parameters["drop_first"]):
end_name = details["category_level_index"][i] if self.parameters["column_naming"] != 'values' else details["category_level"][i]
sql += ["(CASE WHEN \"{}\" = '{}' THEN 1 ELSE 0 END) AS \"{}_{}\"".format(details["category_name"][i], details["category_level"][i], details["category_name"][i], end_name)]
if current_cat != details["category_name"][i]:
cat_idx = 0
current_cat = details["category_name"][i]
n, m = len(predictors), len(details["category_name"])
positions = {}
for i in range(m):
val = str_column(details["category_name"][i])
if val not in positions:
positions[val] = [i]
else:
cat_idx += 1
sql = ", ".join(sql)
for idx, elem in enumerate(X):
sql = sql.replace(self.X[idx], str_column(X[idx]))
return sql
"""
positions[val] += [i]
category_level = []
for p in predictors:
pos = positions[p]
category_level += [details["category_level"][pos[0]:pos[-1] + 1]]
if self.parameters["drop_first"]:
category_level = [elem[1:] for elem in category_level]
func += "category_level = {}\n\t".format(category_level)
func += "def ooe_row(X):\n\t"
func += "\tresult = []\n\t"
func += "\tfor idx, elem in enumerate(X):\n\t\t"
func += "\tfor item in category_level[idx]:\n\t\t\t"
func += "\tif str(elem) == str(item):\n\t\t\t\t"
func += "\tresult += [1]\n\t\t\t"
func += "\telse:\n\t\t\t\t"
func += "\tresult += [0]\n\t"
func += "\treturn result\n"
func += "\treturn np.apply_along_axis(ooe_row, 1, X)\n"
return func
elif self.type in ("RandomForestClassifier", "RandomForestRegressor", "XGBoostRegressor", "XGBoostClassifier",):
def map_idx(x):
for idx, elem in enumerate(self.X):
Expand Down Expand Up @@ -2844,15 +2869,14 @@ def predict_tree(tree_dict, node_id: int, clusters_distance: list):
sql = []
cat_idx, current_cat = 0, details["category_name"][0]
for i in range(n):
if current_cat != details["category_name"][i]:
cat_idx = 0
current_cat = details["category_name"][i]
if cat_idx != 0 or not(self.parameters["drop_first"]):
end_name = details["category_level_index"][i] if self.parameters["column_naming"] != 'values' else details["category_level"][i]
end_name = 'NULL' if end_name == None else end_name
sql += ["(CASE WHEN \"{}\" = {} THEN 1 ELSE 0 END) AS \"{}_{}\"".format(details["category_name"][i], "'" + str(details["category_level"][i]) + "'" if details["category_level"][i] != None else 'NULL', details["category_name"][i], end_name)]
if current_cat != details["category_name"][i]:
cat_idx = 0
current_cat = details["category_name"][i]
else:
cat_idx += 1
cat_idx += 1
sql = ", ".join(sql)
for idx, elem in enumerate(X):
sql = sql.replace(self.X[idx], str_column(X[idx]))
Expand Down Expand Up @@ -3492,7 +3516,7 @@ def score(self, method: str = "accuracy", cutoff: float = 0.5):
check_types([("cutoff", cutoff, [int, float],), ("method", method, [str],)])
if method in ("accuracy", "acc"):
return accuracy_score(
self.y, self.deploySQL(cutoff), self.test_relation, self.cursor
self.y, self.deploySQL(cutoff), self.test_relation, self.cursor, pos_label=1,
)
elif method == "aic":
return aic_bic(self.y, self.deploySQL(), self.test_relation, self.cursor, len(self.X))[0]
Expand All @@ -3501,15 +3525,15 @@ def score(self, method: str = "accuracy", cutoff: float = 0.5):
elif method == "prc_auc":
return prc_auc(self.y, self.deploySQL(), self.test_relation, self.cursor)
elif method == "auc":
return roc_curve(self.y, self.deploySQL(), self.test_relation, self.cursor, auc_roc=True,)
return roc_curve(self.y, self.deploySQL(), self.test_relation, self.cursor, auc_roc=True, nbins=10000,)
elif method in ("best_cutoff", "best_threshold"):
return roc_curve(
self.y,
self.deploySQL(),
self.test_relation,
self.cursor,
best_threshold=True,
nbins=1000,
nbins=10000,
)
elif method in ("recall", "tpr"):
return recall_score(
Expand Down
15 changes: 15 additions & 0 deletions verticapy/tests/vModel/test_decision_tree_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,21 @@ def test_to_sklearn(self, model):

# 'predict_proba'

def test_to_python(self, model, titanic_vd):
    """
    Fit a DecisionTreeClassifier on the titanic data and check, for two input
    rows, that the function generated by to_python returns the same value as
    the PREDICT_RF_CLASSIFIER SQL query run through the model's cursor.
    """
    model_test = DecisionTreeClassifier("rfc_python_test", cursor=model.cursor)
    model_test.drop()
    model_test.fit(titanic_vd, ["age", "fare", "sex"], "embarked")
    # Each case pairs the in-database prediction query with the equivalent
    # input row for the generated Python function.
    cases = (
        (
            "SELECT PREDICT_RF_CLASSIFIER(30.0, 45.0, 'male' USING PARAMETERS model_name = 'rfc_python_test', match_by_pos=True)",
            [30.0, 45.0, 'male'],
        ),
        (
            "SELECT PREDICT_RF_CLASSIFIER(30.0, 145.0, 'female' USING PARAMETERS model_name = 'rfc_python_test', match_by_pos=True)",
            [30.0, 145.0, 'female'],
        ),
    )
    for query, row in cases:
        model_test.cursor.execute(query)
        prediction = model_test.cursor.fetchone()[0]
        python_predict = model_test.to_python(return_str=False)
        assert prediction == python_predict([row])[0]

def test_to_sql(self, model, titanic_vd):
model_test = DecisionTreeClassifier("rfc_sql_test", cursor=model.cursor)
model_test.drop()
Expand Down
Loading

0 comments on commit c4e8b74

Please sign in to comment.