Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package][R-package] adapt to scikit-learn 1.6 testing changes, pin more packages in R 3.6 CI jobs #6718

Merged
merged 27 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions .ci/install-old-r-packages.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# [description]
#
# Installs a pinned set of packages that worked together
# as of the last R 3.6 release.
#

.install_packages <- function(packages) {
install.packages( # nolint: undesirable_function
pkgs = paste( # nolint: paste
"https://cran.r-project.org/src/contrib/Archive"
, packages
, sep = "/"
)
, dependencies = FALSE
, lib = Sys.getenv("R_LIBS")
, repos = NULL
)
}

# when confronted with a bunch of URLs like this, install.packages() sometimes
# struggles to determine install order... so install packages in batches here,
# starting from the root of the dependency graph and working up

# there was only a single release of {praise}, so there is no contrib/Archive URL for it
install.packages( # nolint: undesirable_function
pkgs = "https://cran.r-project.org/src/contrib/praise_1.0.0.tar.gz"
, dependencies = FALSE
, lib = Sys.getenv("R_LIBS")
, repos = NULL
)

.install_packages(c(
"brio/brio_1.1.4.tar.gz" # nolint: non_portable_path
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these hard-coded versions are the latest for which there's a package at https://cran.r-project.org/src/contrib/Archive. That provides a set of packages that were all working together as of a few days ago.

We shouldn't need to actively manage this list... this should be able to remain untouched (I hope) until we drop R 3 support and delete this script entirely.

, "cli/cli_3.6.2.tar.gz" # nolint: non_portable_path
, "crayon/crayon_1.5.2.tar.gz" # nolint: non_portable_path
, "digest/digest_0.6.36.tar.gz" # nolint: non_portable_path
, "evaluate/evaluate_0.23.tar.gz" # nolint: non_portable_path
, "fansi/fansi_1.0.5.tar.gz" # nolint: non_portable_path
, "fs/fs_1.6.4.tar.gz" # nolint: non_portable_path
, "glue/glue_1.7.0.tar.gz" # nolint: non_portable_path
, "jsonlite/jsonlite_1.8.8.tar.gz" # nolint: non_portable_path
, "lattice/lattice_0.20-41.tar.gz" # nolint: non_portable_path
, "magrittr/magrittr_2.0.2.tar.gz" # nolint: non_portable_path
, "pkgconfig/pkgconfig_2.0.2.tar.gz" # nolint: non_portable_path
, "ps/ps_1.8.0.tar.gz" # nolint: non_portable_path
, "R6/R6_2.5.0.tar.gz" # nolint: non_portable_path
, "rlang/rlang_1.1.3.tar.gz" # nolint: non_portable_path
, "rprojroot/rprojroot_2.0.3.tar.gz" # nolint: non_portable_path
, "utf8/utf8_1.2.3.tar.gz" # nolint: non_portable_path
, "withr/withr_3.0.1.tar.gz" # nolint: non_portable_path
))

.install_packages(c(
"desc/desc_1.4.2.tar.gz" # nolint: non_portable_path
, "diffobj/diffobj_0.3.4.tar.gz" # nolint: non_portable_path
, "lifecycle/lifecycle_1.0.3.tar.gz" # nolint: non_portable_path
, "processx/processx_3.8.3.tar.gz" # nolint: non_portable_path
))

.install_packages(c(
"callr/callr_3.7.5.tar.gz" # nolint: non_portable_path
, "vctrs/vctrs_0.6.4.tar.gz" # nolint: non_portable_path
))

.install_packages(c(
"pillar/pillar_1.8.1.tar.gz" # nolint: non_portable_path
, "tibble/tibble_3.2.0.tar.gz" # nolint: non_portable_path
))

.install_packages(c(
"pkgbuild/pkgbuild_1.4.4.tar.gz" # nolint: non_portable_path
, "rematch2/rematch2_2.1.1.tar.gz" # nolint: non_portable_path
, "waldo/waldo_0.5.3.tar.gz" # nolint: non_portable_path
))

.install_packages(c(
"pkgload/pkgload_1.3.4.tar.gz" # nolint: non_portable_path
, "testthat/testthat_3.2.1.tar.gz" # nolint: non_portable_path
))
4 changes: 2 additions & 2 deletions .ci/test-r-package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ if [[ $OS_NAME == "macos" ]]; then
export R_TIDYCMD=/usr/local/bin/tidy
fi

# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
# fix for issue where CRAN was not returning {evaluate}, {lattice}, or {waldo} when using R 3.6
# "Warning: dependency ‘lattice’ is not available"
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
Rscript --vanilla ./.ci/install-old-r-packages.R
else
# {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
# This should be unnecessary on R >=4.4.0
Expand Down
10 changes: 10 additions & 0 deletions python-package/lightgbm/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import assert_all_finite, check_array, check_X_y

# sklearn.utils Tags types can be imported unconditionally once
# lightgbm's minimum scikit-learn version is 1.6 or higher
try:
from sklearn.utils import ClassifierTags as _sklearn_ClassifierTags
from sklearn.utils import RegressorTags as _sklearn_RegressorTags
except ImportError:
_sklearn_ClassifierTags = None
_sklearn_RegressorTags = None
try:
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
Expand Down Expand Up @@ -140,6 +148,8 @@ class _LGBMRegressorBase: # type: ignore
_LGBMCheckClassificationTargets = None
_LGBMComputeSampleWeight = None
_LGBMValidateData = None
_sklearn_ClassifierTags = None
_sklearn_RegressorTags = None
Comment on lines +151 to +152
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these defined again here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because if an earlier ImportError happens, then this part above would never get evaluated:

    except ImportError:
        _sklearn_ClassifierTags = None
        _sklearn_RegressorTags = None

And then the from .compat imprt _sklearn_ClassifierTags in sklearn.py would fail. This is not new, just following the pattern that's existed in LightGBM for a long time (see all the other {something} = None above it).

Happy to consider something else if you have a recommendation for improving this!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, sorry.

The first one is defined for scikit-learn<1.6 and the second when scikit-learn isn't installed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah exactly. No need to be sorry, it's confusing!

_sklearn_version = None

# additional scikit-learn imports only for type hints
Expand Down
13 changes: 10 additions & 3 deletions python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
_LGBMModelBase,
_LGBMRegressorBase,
_LGBMValidateData,
_sklearn_ClassifierTags,
_sklearn_RegressorTags,
_sklearn_version,
dt_DataTable,
pd_DataFrame,
Expand Down Expand Up @@ -703,7 +705,6 @@ def _update_sklearn_tags_from_dict(
tags.input_tags.allow_nan = tags_dict["allow_nan"]
tags.input_tags.sparse = "sparse" in tags_dict["X_types"]
tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"]
tags._xfail_checks = tags_dict["_xfail_checks"]
return tags

def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
Expand Down Expand Up @@ -1291,7 +1292,10 @@ def _more_tags(self) -> Dict[str, Any]:
return tags

def __sklearn_tags__(self) -> "_sklearn_Tags":
return LGBMModel.__sklearn_tags__(self)
tags = LGBMModel.__sklearn_tags__(self)
tags.estimator_type = "regressor"
tags.regressor_tags = _sklearn_RegressorTags(multi_label=False)
return tags

def fit( # type: ignore[override]
self,
Expand Down Expand Up @@ -1350,7 +1354,10 @@ def _more_tags(self) -> Dict[str, Any]:
return tags

def __sklearn_tags__(self) -> "_sklearn_Tags":
return LGBMModel.__sklearn_tags__(self)
tags = LGBMModel.__sklearn_tags__(self)
tags.estimator_type = "classifier"
tags.classifier_tags = _sklearn_ClassifierTags(multi_class=True, multi_label=False)
return tags

def fit( # type: ignore[override]
self,
Expand Down
38 changes: 34 additions & 4 deletions tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,18 @@
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.utils.estimator_checks import parametrize_with_checks as sklearn_parametrize_with_checks
from sklearn.utils.validation import check_is_fitted

import lightgbm as lgb
from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
from lightgbm.compat import (
DATATABLE_INSTALLED,
PANDAS_INSTALLED,
_sklearn_version,
dt_DataTable,
pd_DataFrame,
pd_Series,
)

from .utils import (
assert_silent,
Expand All @@ -35,6 +42,9 @@
softmax,
)

SKLEARN_MAJOR, SKLEARN_MINOR, *_ = _sklearn_version.split(".")
SKLEARN_VERSION_GTE_1_6 = int(SKLEARN_MAJOR) >= 1 and int(SKLEARN_MINOR) >= 6
jameslamb marked this conversation as resolved.
Show resolved Hide resolved

decreasing_generator = itertools.count(0, -1)
estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
task_to_model_factory = {
Expand Down Expand Up @@ -1432,7 +1442,28 @@ def test_getting_feature_names_in_pd_input(estimator_class):
np.testing.assert_array_equal(model.feature_names_in_, X.columns)


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
# Starting with scikit-learn 1.6 (https://github.com/scikit-learn/scikit-learn/pull/30149),
# the only API for marking estimator tests as expected to fail is to pass a keyword argument
# to parametrize_with_checks(). That function didn't accept additional arguments in earlier
# versions.
#
# This block defines a patched version of parametrize_with_checks() so lightgbm's tests
# can be compatible with scikit-learn <1.6 and >=1.6.
#
# This should be removed minimum supported scikit-learn version is at least 1.6.
jameslamb marked this conversation as resolved.
Show resolved Hide resolved
if SKLEARN_VERSION_GTE_1_6:
parametrize_with_checks = sklearn_parametrize_with_checks
else:

def parametrize_with_checks(estimator, *args, **kwargs):
return sklearn_parametrize_with_checks(estimator)


def _get_expected_failed_tests(estimator):
return estimator._more_tags()["_xfail_checks"]


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)
check(estimator)
Expand All @@ -1457,7 +1488,6 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
assert sklearn_tags.input_tags.allow_nan is True
assert sklearn_tags.input_tags.sparse is True
assert sklearn_tags.target_tags.one_d_labels is True
assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]


@pytest.mark.parametrize("task", all_tasks)
Expand Down
Loading