Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Contrib: added xgboost-optuna contrib #479

Merged
merged 1 commit into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions contrib/hamilton/contrib/user/zilto/xgboost_optuna/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Purpose of this module

This module implements a dataflow to train an XGBoost model with hyperparameter tuning using Optuna.

You give it 2D arrays for `X_train`, `y_train`, `X_test`, `y_test` and you are good to go!

# Configuration Options
The Hamilton driver can be configured with the following options:
- {"task": "classification"} to use xgboost.XGBClassifier.
- {"task": "regression"} to use xgboost.XGBRegressor.

There are several relevant inputs and override points.

**Inputs**:
- `model_config_override`: Pass a dictionary to override the XGBoost default config. **Warning**: passing `model_config_override = {"objective": "binary:logistic"}` to an `XGBRegressor` effectively changes it to an `XGBClassifier`.
- `optuna_distributions_override`: Pass a dictionary of optuna distributions to define the hyperparameter search space.

**Overrides**:
- `base_model`: can change it to the type `xgboost.XGBRanker` for a ranking task or `xgboost.dask.DaskXGBClassifier` to support Dask
- `scoring_func`: can be any `sklearn.metrics` function that accepts `y_true` and `y_pred` as arguments. Remember to set `higher_is_better` accordingly for the optimization task.
- `cross_validation_folds`: can be any sequence of tuples that define (`train_index`, `validation_index`) to train the model with cross-validation over `X_train`

# Limitations

- It is difficult to adapt for distributed Optuna hyperparameter search.
- The current structure makes it difficult to add custom training callbacks to the XGBoost model (can be done to some extent via `model_config_override`).
251 changes: 251 additions & 0 deletions contrib/hamilton/contrib/user/zilto/xgboost_optuna/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
import logging
from types import FunctionType
from typing import Any, Callable, Optional, Sequence

logger = logging.getLogger(__name__)

from hamilton import contrib

with contrib.catch_import_errors(__name__, __file__, logger):
import numpy as np
import pandas as pd
import xgboost
import optuna
from optuna.distributions import IntDistribution, FloatDistribution
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

from hamilton.function_modifiers import config, extract_fields


def model_config(seed: int = 0, model_config_override: Optional[dict] = None) -> dict:
    """Assemble the keyword arguments used to instantiate the XGBoost model.

    Starts from a fixed set of defaults, layers ``model_config_override`` on
    top of them, and finally writes ``seed`` so that the ``seed`` argument
    always takes precedence over a ``seed`` entry in the override.
    ref: https://xgboost.readthedocs.io/en/stable/parameter.html

    :param seed: Value stored under the ``seed`` key of the result
    :param model_config_override: Model configuration arguments; entries
        replace or extend the defaults. The passed dictionary is not modified.
    :return: A new dictionary of model configuration arguments
    """
    params = {
        "device": "cpu",
        "booster": "gbtree",
        "learning_rate": 0.05,  # alias: eta; typical 0.01 to 0.2
        "max_depth": 3,  # typical 3 to 10; deeper trees tend to overfit
        "gamma": 0.1,  # alias: min_split_loss; 0 to +inf
        "n_estimators": 200,
        "colsample_bytree": 1,  # typical 0.5 to 1
        "subsample": 1,  # typical 0.6 to 1
        "min_child_weight": 1,  # 0 to +inf; prevents overfitting; too high underfits
        "max_delta_step": 0,  # 0 is no constraint; used in imbalanced logistic reg; typical 1 to 10
        "reg_alpha": 0,  # alias: alpha; default 0
        "reg_lambda": 1,  # alias: lambda; default 1
        "tree_method": "hist",
        "enable_categorical": True,
        "max_cat_to_onehot": None,
        "verbosity": 2,  # 0: silent, 1: warning, 2: info, 3: debug
        "early_stopping_rounds": 20,
    }

    if model_config_override:
        params.update(**model_config_override)

    # Written last on purpose: the explicit `seed` argument wins over any override.
    params["seed"] = seed
    return params


def optuna_distributions(optuna_distributions_override: Optional[dict] = None) -> dict:
    """Build the hyperparameter search space explored during optimization.

    :param optuna_distributions_override: Hyperparameter distributions to
        explore; entries replace or extend the default search space
    :return: Mapping of hyperparameter name to its optuna distribution
    """
    search_space = {
        "n_estimators": IntDistribution(low=250, high=700, step=150),
        "learning_rate": FloatDistribution(low=0.01, high=0.2, log=True),
        "max_depth": IntDistribution(low=3, high=10),
        "gamma": FloatDistribution(low=0.01, high=20, log=True),
        "colsample_bytree": FloatDistribution(low=0.5, high=1),
        "min_child_weight": IntDistribution(low=1, high=20, log=True),
        "max_delta_step": IntDistribution(low=0, high=10),
    }

    if optuna_distributions_override:
        search_space.update(**optuna_distributions_override)

    return search_space


@config.when(task="classification")
def base_model__classification() -> Callable:
    """Return the model class (not an instance) used when task="classification"."""
    model_class = xgboost.XGBClassifier
    return model_class


@extract_fields(
    {
        "scoring_func": FunctionType,
        "higher_is_better": bool,
    }
)
@config.when(task="classification")
def scorer__classification() -> dict:
    """Default scorer for classification: accuracy, where a higher score is better.

    The returned dictionary is split into the `scoring_func` and
    `higher_is_better` fields declared in the decorator above.
    """
    return {
        "scoring_func": accuracy_score,
        "higher_is_better": True,
    }


@config.when(task="regression")
def base_model__regression() -> Callable:
    """Return the model class (not an instance) used when task="regression"."""
    model_class = xgboost.XGBRegressor
    return model_class


@extract_fields(
    {
        "scoring_func": FunctionType,
        "higher_is_better": bool,
    }
)
@config.when(task="regression")
def scorer__regression() -> dict:
    """Default scorer for regression: mean squared error, where a lower score is better.

    The returned dictionary is split into the `scoring_func` and
    `higher_is_better` fields declared in the decorator above.
    """
    return {
        "scoring_func": mean_squared_error,
        "higher_is_better": False,
    }


def cross_validation_folds(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray | pd.DataFrame,
    n_cv_folds: int = 3,
    shuffle: bool = True,
    stratify: bool = True,
    seed: int = 0,
) -> Sequence[tuple]:  # [Sequence[int], Sequence[int]]]:
    """Get a list of tuples (train_idx, validation_idx)
    Override at the Hamilton execution level to support any cross-validation strategy

    :param X_train: Training features passed to the splitter
    :param y_train: Training targets passed to the splitter
    :param n_cv_folds: Number of folds (`n_splits` of the splitter)
    :param shuffle: Whether the splitter shuffles the rows before splitting
    :param stratify: Use `StratifiedKFold` when True, `KFold` otherwise.
        NOTE(review): stratification is meant for class labels; with a
        continuous regression target you likely need `stratify=False` - confirm.
    :param seed: Random state of the splitter; only applied when `shuffle` is True
    :return: One (train_idx, validation_idx) tuple per fold
    """
    # scikit-learn rejects a non-None random_state when shuffle is False, so the
    # seed is only forwarded when it can actually have an effect.
    random_state = seed if shuffle else None

    splitter_class = StratifiedKFold if stratify else KFold
    kfold = splitter_class(n_splits=n_cv_folds, shuffle=shuffle, random_state=random_state)

    return list(kfold.split(X_train, y_train))


def study(
    higher_is_better: bool,
    pruner: Optional[optuna.pruners.BasePruner] = None,
    sampler: Optional[optuna.samplers.BaseSampler] = None,
    study_storage: Optional[str] = None,
    study_name: Optional[str] = None,
    load_if_exists: bool = False,
) -> optuna.study.Study:
    """Create the optuna study that drives the hyperparameter search.
    ref: https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_integration.py

    :param higher_is_better: True to maximize the objective, False to minimize it
    :param pruner: Pruner forwarded to `optuna.create_study`. Left as None,
        optuna falls back to its default `MedianPruner`, so no pruner instance
        is created at import time or shared between studies.
    :param sampler: Sampler forwarded to `optuna.create_study`
    :param study_storage: Forwarded as the `storage` argument of `optuna.create_study`
    :param study_name: Name of the study
    :param load_if_exists: Forwarded to `optuna.create_study`
    :return: The created (or loaded) optuna study
    """
    direction = "maximize" if higher_is_better else "minimize"
    return optuna.create_study(
        direction=direction,
        pruner=pruner,
        sampler=sampler,
        study_name=study_name,
        storage=study_storage,
        load_if_exists=load_if_exists,
    )


def _select_rows(data, positions):
    """Return the rows of `data` located at the integer `positions`."""
    # pandas objects must be indexed positionally: plain `[]` is label-based on a
    # Series (and column-based on a DataFrame), which would pick the wrong rows
    # or raise KeyError whenever the index is not 0..n-1.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[positions]
    return data[positions]


@extract_fields(
    dict(
        study_results=optuna.study.Study,
        best_hyperparameters=dict,
    )
)
def hyperparameter_search(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray | pd.DataFrame,
    cross_validation_folds: Sequence[tuple],  # Sequence[tuple[Sequence[int], Sequence[int]]],
    base_model: Callable,
    model_config: dict,
    scoring_func: FunctionType,
    optuna_distributions: dict,
    study: optuna.study.Study,
    n_optuna_trials: int = 10,
) -> dict:
    """Search over the optuna distributions for n trials, trying to achieve
    the best validation score.

    For every trial, a fresh model is trained on each cross-validation fold and
    scored on the matching validation fold; the mean of the fold scores is
    reported back to the study.

    :param X_train: Training features; pandas objects are sliced by position
    :param y_train: Training targets; pandas objects (DataFrame or Series) are sliced by position
    :param cross_validation_folds: Sequence of (train_idx, validation_idx) positional indices
    :param base_model: Callable that builds a model from `model_config`
    :param model_config: Keyword arguments passed to `base_model`
    :param scoring_func: Called as `scoring_func(y_true=..., y_pred=...)`
    :param optuna_distributions: Search space passed to `study.ask`
    :param study: Optuna study that suggests parameters and collects the scores
    :param n_optuna_trials: Number of trials to run
    :return: dict with `study_results` (the study) and `best_hyperparameters`
        (`study.best_params`)
    """
    for _ in range(n_optuna_trials):
        trial = study.ask(optuna_distributions)

        fold_scores = []
        for train_fold, validation_fold in cross_validation_folds:
            X_train_fold = _select_rows(X_train, train_fold)
            X_validation_fold = _select_rows(X_train, validation_fold)
            y_train_fold = _select_rows(y_train, train_fold)
            y_validation_fold = _select_rows(y_train, validation_fold)

            # A new model per fold so no fitted state leaks between folds.
            model = base_model(**model_config)
            model.set_params(**trial.params)

            model.fit(
                X_train_fold,
                y_train_fold,
                eval_set=[(X_validation_fold, y_validation_fold)],
                verbose=False,
            )

            y_validation_fold_pred = model.predict(X_validation_fold)
            score = scoring_func(y_true=y_validation_fold, y_pred=y_validation_fold_pred)

            fold_scores.append(score)

        study.tell(trial, np.mean(fold_scores))

    return dict(
        study_results=study,
        best_hyperparameters=study.best_params,
    )


def study_results_df(study_results: optuna.study.Study) -> pd.DataFrame:
    """Export the trials recorded in the optuna study as a dataframe.

    :param study_results: Study whose trials are exported
    :return: The output of `study_results.trials_dataframe()`
    """
    trials_summary = study_results.trials_dataframe()
    return trials_summary


def best_model(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray | pd.DataFrame,
    base_model: Callable,
    model_config: dict,
    best_hyperparameters: dict,
) -> xgboost.XGBModel:
    """Fit a model on the full training set using the best hyperparameters.

    :param X_train: Training features
    :param y_train: Training targets
    :param base_model: Callable that builds a model from `model_config`
    :param model_config: Keyword arguments passed to `base_model`
    :param best_hyperparameters: Parameters applied on top of `model_config`
    :return: The fitted model
    """
    # Early stopping is switched off here: the fit call below is given no
    # validation set.
    estimator = base_model(**model_config).set_params(
        early_stopping_rounds=None,
        **best_hyperparameters,
    )
    estimator.fit(X_train, y_train)

    return estimator


def y_test_pred(
    best_model: xgboost.XGBModel,
    X_test: np.ndarray | pd.DataFrame,
) -> np.ndarray:
    """Predict on the test set with the best model.

    :param best_model: Fitted model exposing `predict`
    :param X_test: Test features
    :return: The output of `best_model.predict(X_test)`
    """
    predictions = best_model.predict(X_test)
    return predictions


def test_score(
    y_test: np.ndarray | pd.DataFrame,
    y_test_pred: np.ndarray,
    scoring_func: FunctionType,
) -> Any:
    """Evaluate the best model's test predictions with the scoring function.

    :param y_test: Ground-truth test targets, passed as `y_true`
    :param y_test_pred: Predictions on the test set, passed as `y_pred`
    :param scoring_func: Called as `scoring_func(y_true=..., y_pred=...)`
    :return: Whatever `scoring_func` returns
    """
    score = scoring_func(y_true=y_test, y_pred=y_test_pred)
    return score
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
numpy
optuna
pandas
scikit-learn
sf-hamilton[visualization]
xgboost
Loading
Loading