Skip to content

Commit

Permalink
Merge pull request #21 from uclamii/xgb_early_bug_estimators
Browse files Browse the repository at this point in the history
Fixing bug related to n_estimators not being used
  • Loading branch information
elemets committed Jul 11, 2024
2 parents 207448e + 832a7e8 commit d3756c2
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 12 deletions.
21 changes: 15 additions & 6 deletions notebooks/binary_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@
"xgbearly = True\n",
"\n",
"tuned_parameters = {\n",
" f\"{estimator_name}__max_depth\": [3, 200],\n",
" f\"{estimator_name}__max_depth\": [3],\n",
" f\"{estimator_name}__learning_rate\": [1e-4],\n",
" f\"{estimator_name}__n_estimators\": [100000],\n",
" f\"{estimator_name}__early_stopping_rounds\": [10],\n",
" f\"{estimator_name}__verbose\": [False],\n",
" f\"{estimator_name}__early_stopping_rounds\": [2],\n",
" f\"{estimator_name}__verbose\": [True],\n",
" f\"{estimator_name}__eval_metric\": [\"logloss\"],\n",
"}"
]
Expand All @@ -182,7 +182,7 @@
" stratify_y=True,\n",
" grid=tuned_parameters,\n",
" randomized_grid=True,\n",
" n_iter=40,\n",
" n_iter=1,\n",
" xgboost_early=True,\n",
" scoring=[\"roc_auc\"],\n",
" n_splits=10,\n",
Expand All @@ -198,7 +198,7 @@
"X_test, y_test = model.get_test_data(X, y)\n",
"X_valid, y_valid = model.get_valid_data(X, y)\n",
"\n",
"model.fit(X_train, y_train)\n",
"model.fit(X_train, y_train, validation_data=[X_valid, y_valid])\n",
"\n",
"print(\"Validation Metrics\")\n",
"model.return_metrics(X_valid, y_valid)\n",
Expand All @@ -211,6 +211,15 @@
"y_pred = model.predict(X_test, optimal_threshold=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.best_params_per_score"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -325,7 +334,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.7.12"
}
},
"nbformat": 4,
Expand Down
78 changes: 78 additions & 0 deletions notebooks/xgb_early_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Smoke test for XGBoost early stopping support in ``model_tuner``.

Trains an ``XGBClassifier`` on the scikit-learn breast-cancer dataset with a
deliberately huge ``n_estimators`` cap and early stopping enabled, to verify
that the ``n_estimators`` actually found via early stopping is used (the bug
this PR fixes: "n_estimators not being used").
"""

import os
import sys

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, make_classification
from xgboost import XGBClassifier

from model_tuner.bootstrapper import evaluate_bootstrap_metrics
from model_tuner.model_tuner_utils import Model
from model_tuner.pickleObjects import dumpObjects, loadObjects

# Load the breast-cancer data as a single DataFrame; every column except
# the target is used as a feature.
bc = load_breast_cancer(as_frame=True)["frame"]
bc_cols = [cols for cols in bc.columns if "target" not in cols]
X = bc[bc_cols]
y = bc["target"]

estimator = XGBClassifier(
    objective="binary:logistic",
)

estimator_name = "xgb"
xgbearly = True

# n_estimators is intentionally enormous: early stopping (10 rounds without
# improvement of logloss on the validation set) should halt training long
# before the cap, and the discovered boosting-round count should be written
# back into the grid by grid_search_param_tuning.
tuned_parameters = {
    f"{estimator_name}__max_depth": [3, 10, 20, 200, 500],
    f"{estimator_name}__learning_rate": [1e-4],
    f"{estimator_name}__n_estimators": [100000],
    f"{estimator_name}__early_stopping_rounds": [10],
    f"{estimator_name}__verbose": [True],
    f"{estimator_name}__eval_metric": ["logloss"],
}

kfold = False
calibrate = False

model = Model(
    name="XGBoost Early",
    estimator_name=estimator_name,
    calibrate=calibrate,
    estimator=estimator,
    kfold=kfold,
    stratify_y=True,
    grid=tuned_parameters,
    randomized_grid=True,
    n_iter=1,
    xgboost_early=True,
    scoring=["roc_auc"],
    n_splits=10,
    selectKBest=False,
    n_jobs=-2,
    random_state=42,
)

# Tune on the full data; the Model object handles its own train/valid/test
# splitting internally (see get_*_data accessors below).
model.grid_search_param_tuning(X, y)

X_train, y_train = model.get_train_data(X, y)
X_test, y_test = model.get_test_data(X, y)
X_valid, y_valid = model.get_valid_data(X, y)

# validation_data is required for xgboost_early so fit() can monitor the
# eval metric and stop early.
model.fit(X_train, y_train, validation_data=[X_valid, y_valid])

print("Validation Metrics")
model.return_metrics(X_valid, y_valid)
print("Test Metrics")
model.return_metrics(X_test, y_test)

y_prob = model.predict_proba(X_test)

### F1 Weighted
y_pred = model.predict(X_test, optimal_threshold=True)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="model_tuner",
version="0.0.08a",
version="0.0.09a",
author="UCLA CTSI ML Team: Leonid Shpaner, Arthur Funnell, Panayiotis Petousis",
author_email="[email protected]",
description="A Python library for tuning Machine Learning models.",
Expand Down
2 changes: 1 addition & 1 deletion src/model_tuner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = "0.0.08a"
__version__ = "0.0.09a"

from .main import *
27 changes: 23 additions & 4 deletions src/model_tuner/model_tuner_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,6 @@ def calibrateModel(

if self.imbalance_sampler:
self.process_imbalance_sampler(X_train, y_train)

else:
self.fit(X_train, y_train)
# calibrate model, and save output
Expand Down Expand Up @@ -749,7 +748,7 @@ def grid_search_param_tuning(
self.process_imbalance_sampler(X_train, y_train)
for score in self.scoring:
scores = []
for params in tqdm(self.grid):
for index, params in enumerate(tqdm(self.grid)):
if self.xgboost_early:
estimator_verbosity = f"{self.estimator_name}__verbose"

Expand Down Expand Up @@ -801,16 +800,36 @@ def grid_search_param_tuning(
clf = self.estimator.set_params(**params).fit(
X_train, y_train, **xgb_params
)

### extracting the best parameters found through early stopping
best_early_stopping_params = clf.named_steps[
self.estimator_name
].get_params()

### updating the params in the param grid with these updated parameters
for (
param_name,
param_value,
) in best_early_stopping_params.items():
if param_name in params:
params[param_name] = param_value

params[f"{self.estimator_name}__n_estimators"] = clf[
len(clf) - 1
].best_iteration

# Update the parameters in the grid
self.grid[index] = params

else:
clf = self.estimator.set_params(**params).fit(X_train, y_train)

if score in self.custom_scorer:
scorer_func = self.custom_scorer[score]
else:
scorer_func = get_scorer(score)

score_value = scorer_func(clf, X_valid, y_valid)
# if custom_scorer
scores.append(score_value)

self.best_params_per_score[score] = {
Expand Down

0 comments on commit d3756c2

Please sign in to comment.