From 2eea80f04dd01f014b67a721a4bb114ee589ed7a Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Fri, 28 Jan 2022 17:34:15 +0900 Subject: [PATCH] [rebase] Rebase to the latest version and merge test_evaluator to train_evaluator Since test_evaluator can be merged, I merged it. * [rebase] Rebase and merge the changes in non-test files without issues * [refactor] Merge test- and train-evaluator * [fix] Fix the import error due to the change xxx_evaluator --> evaluator * [test] Fix errors in tests * [fix] Fix the handling of test pred in no resampling * [refactor] Move save_y_opt=False for no resampling deepter for simplicity * [test] Increase the budget size for no resample tests --- autoPyTorch/api/base_task.py | 10 +- autoPyTorch/api/tabular_classification.py | 2 +- autoPyTorch/api/tabular_regression.py | 2 +- autoPyTorch/datasets/resampling_strategy.py | 8 + autoPyTorch/evaluation/abstract_evaluator.py | 2 +- .../{train_evaluator.py => evaluator.py} | 116 +++---- autoPyTorch/evaluation/tae.py | 84 ++--- autoPyTorch/optimizer/smbo.py | 8 +- test/test_api/test_api.py | 304 ++++------------- test/test_api/utils.py | 34 +- test/test_evaluation/test_evaluators.py | 316 +++++++----------- test/test_evaluation/test_tae.py | 2 +- .../test_tabular_classification.py | 13 + 13 files changed, 298 insertions(+), 603 deletions(-) rename autoPyTorch/evaluation/{train_evaluator.py => evaluator.py} (72%) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index eae9f3f70..88227df85 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -315,7 +315,7 @@ def _get_dataset_input_validator( Testing feature set y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set - resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + resampling_strategy (Optional[ResamplingStrategies]): Strategy to split the training data. if None, uses HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): @@ -355,7 +355,7 @@ def get_dataset( Testing feature set y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set - resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + resampling_strategy (Optional[ResamplingStrategies]): Strategy to split the training data. if None, uses HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): @@ -973,7 +973,7 @@ def _search( `SMAC `_. tae_func (Optional[Callable]): TargetAlgorithm to be optimised. If None, `eval_function` - available in autoPyTorch/evaluation/train_evaluator is used. + available in autoPyTorch/evaluation/evaluator is used. Must be child class of AbstractEvaluator. all_supported_metrics (bool: default=True): If True, all metrics supporting current task will be calculated @@ -1380,7 +1380,7 @@ def fit_pipeline( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, run_time_limit_secs: int = 60, memory_limit: Optional[int] = None, @@ -1415,7 +1415,7 @@ def fit_pipeline( be provided to track the generalization performance of each stage. dataset_name (Optional[str]): Name of the dataset, if None, random value is used. 
- resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + resampling_strategy (Optional[ResamplingStrategies]): Strategy to split the training data. if None, uses HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 03519bef8..d1cbe12d9 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -330,7 +330,7 @@ def search( `SMAC `_. tae_func (Optional[Callable]): TargetAlgorithm to be optimised. If None, `eval_function` - available in autoPyTorch/evaluation/train_evaluator is used. + available in autoPyTorch/evaluation/evaluator is used. Must be child class of AbstractEvaluator. all_supported_metrics (bool: default=True): If True, all metrics supporting current task will be calculated diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 8c0637e39..9b71c436e 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -331,7 +331,7 @@ def search( `SMAC `_. tae_func (Optional[Callable]): TargetAlgorithm to be optimised. If None, `eval_function` - available in autoPyTorch/evaluation/train_evaluator is used. + available in autoPyTorch/evaluation/evaluator is used. Must be child class of AbstractEvaluator. all_supported_metrics (bool: default=True): If True, all metrics supporting current task will be calculated diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 78447a04e..e09747258 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -93,6 +93,14 @@ def is_stratified(self) -> bool: # TODO: replace it with another way ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] + +def check_resampling_strategy(resampling_strategy: Optional[ResamplingStrategies]) -> None: + choices = (CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes) + if not isinstance(resampling_strategy, choices): + rs_names = (rs.__mro__[0].__name__ for rs in choices) + raise ValueError(f'resampling_strategy must be in {rs_names}, but got {resampling_strategy}') + + DEFAULT_RESAMPLING_PARAMETERS: Dict[ ResamplingStrategies, Dict[str, Any] diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index b0d5a433f..c56141904 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -207,7 +207,7 @@ def __init__(self, backend: Backend, An evaluator is an object that: + constructs a pipeline (i.e. a classification or regression estimator) for a given pipeline_config and run settings (budget, seed) - + Fits and trains this pipeline (TrainEvaluator) or tests a given + + Fits and trains this pipeline (Evaluator) or tests a given configuration (TestEvaluator) The provided configuration determines the type of pipeline created. 
For more diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/evaluator.py similarity index 72% rename from autoPyTorch/evaluation/train_evaluator.py rename to autoPyTorch/evaluation/evaluator.py index 62c02029f..a6eef9b2b 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/evaluator.py @@ -7,12 +7,11 @@ from smac.tae import StatusType -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import ( - CLASSIFICATION_TASKS, - MULTICLASSMULTIOUTPUT, +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + NoResamplingStrategyTypes, + check_resampling_strategy ) -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, EvaluationResults, @@ -21,7 +20,8 @@ from autoPyTorch.evaluation.abstract_evaluator import EvaluatorParams, FixedPipelineParams from autoPyTorch.utils.common import dict_repr, subsampler -__all__ = ['TrainEvaluator', 'eval_train_function'] +__all__ = ['Evaluator', 'eval_fn'] + class _CrossValidationResultsManager: def __init__(self, num_folds: int): @@ -83,15 +83,13 @@ def get_result_dict(self) -> Dict[str, Any]: ) -class TrainEvaluator(AbstractEvaluator): +class Evaluator(AbstractEvaluator): """ This class builds a pipeline using the provided configuration. A pipeline implementing the provided configuration is fitted using the datamanager object retrieved from disc, via the backend. After the pipeline is fitted, it is save to disc and the performance estimate - is communicated to the main process via a Queue. It is only compatible - with `CrossValTypes`, `HoldoutValTypes`, i.e, when the training data - is split and the validation set is used for SMBO optimisation. + is communicated to the main process via a Queue. Args: queue (Queue): @@ -101,43 +99,17 @@ class TrainEvaluator(AbstractEvaluator): Fixed parameters for a pipeline evaluator_params (EvaluatorParams): The parameters for an evaluator. + + Attributes: + train (bool): + Whether the training data is split and the validation set is used for SMBO optimisation. + cross_validation (bool): + Whether we use cross validation or not. 
""" - def __init__(self, backend: Backend, queue: Queue, - metric: autoPyTorchMetric, - budget: float, - configuration: Union[int, str, Configuration], - budget_type: str = None, - pipeline_config: Optional[Dict[str, Any]] = None, - seed: int = 1, - output_y_hat_optimization: bool = True, - num_run: Optional[int] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - keep_models: Optional[bool] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: - super().__init__( - backend=backend, - queue=queue, - configuration=configuration, - metric=metric, - seed=seed, - output_y_hat_optimization=output_y_hat_optimization, - num_run=num_run, - include=include, - exclude=exclude, - disable_file_output=disable_file_output, - init_params=init_params, - budget=budget, - budget_type=budget_type, - logger_port=logger_port, - all_supported_metrics=all_supported_metrics, - pipeline_config=pipeline_config, - search_space_updates=search_space_updates - ) + def __init__(self, queue: Queue, fixed_pipeline_params: FixedPipelineParams, evaluator_params: EvaluatorParams): + resampling_strategy = fixed_pipeline_params.backend.load_datamanager().resampling_strategy + self.train = not isinstance(resampling_strategy, NoResamplingStrategyTypes) + self.cross_validation = isinstance(resampling_strategy, CrossValTypes) if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): raise ValueError( @@ -175,7 +147,7 @@ def _evaluate_on_split(self, split_id: int) -> EvaluationResults: return EvaluationResults( pipeline=pipeline, - opt_loss=self._loss(labels=self.y_train[opt_split], preds=opt_pred), + opt_loss=self._loss(labels=self.y_train[opt_split] if self.train else self.y_test, preds=opt_pred), train_loss=self._loss(labels=self.y_train[train_split], preds=train_pred), opt_pred=opt_pred, valid_pred=valid_pred, @@ -201,6 +173,7 @@ def _cross_validation(self) -> EvaluationResults: results = self._evaluate_on_split(split_id) self.pipelines[split_id] = results.pipeline + assert opt_split is not None # mypy redefinition cv_results.update(split_id, results, len(train_split), len(opt_split)) self.y_opt = np.concatenate([y_opt for y_opt in Y_opt if y_opt is not None]) @@ -212,15 +185,16 @@ def evaluate_loss(self) -> None: if self.splits is None: raise ValueError(f"cannot fit pipeline {self.__class__.__name__} with datamanager.splits None") - if self.num_folds == 1: + if self.cross_validation: + results = self._cross_validation() + else: _, opt_split = self.splits[0] results = self._evaluate_on_split(split_id=0) - self.y_opt, self.pipelines[0] = self.y_train[opt_split], results.pipeline - else: - results = self._cross_validation() + self.pipelines[0] = results.pipeline + self.y_opt = self.y_train[opt_split] if self.train else self.y_test self.logger.debug( - f"In train evaluator.evaluate_loss, num_run: {self.num_run}, loss:{results.opt_loss}," + f"In evaluate_loss, num_run: {self.num_run}, loss:{results.opt_loss}," f" status: {results.status},\nadditional run info:\n{dict_repr(results.additional_run_info)}" ) self.record_evaluation(results=results) @@ -240,41 +214,23 @@ def _fit_and_evaluate_loss( kwargs = {'pipeline': pipeline, 'unique_train_labels': self.unique_train_labels[split_id]} train_pred = 
self.predict(subsampler(self.X_train, train_indices), **kwargs) - opt_pred = self.predict(subsampler(self.X_train, opt_indices), **kwargs) - valid_pred = self.predict(self.X_valid, **kwargs) test_pred = self.predict(self.X_test, **kwargs) + valid_pred = self.predict(self.X_valid, **kwargs) + + # No resampling ===> evaluate on test dataset + opt_pred = self.predict(subsampler(self.X_train, opt_indices), **kwargs) if self.train else test_pred assert train_pred is not None and opt_pred is not None # mypy check return train_pred, opt_pred, valid_pred, test_pred -# create closure for evaluating an algorithm -def eval_train_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, -) -> None: +def eval_fn(queue: Queue, fixed_pipeline_params: FixedPipelineParams, evaluator_params: EvaluatorParams) -> None: """ This closure allows the communication between the TargetAlgorithmQuery and the - pipeline trainer (TrainEvaluator). + pipeline trainer (Evaluator). Fundamentally, smac calls the TargetAlgorithmQuery.run() method, which internally - builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + builds an Evaluator. The Evaluator builds a pipeline, stores the output files to disc via the backend, and puts the performance result of the run in the queue. Args: @@ -286,7 +242,11 @@ def eval_train_function( evaluator_params (EvaluatorParams): The parameters for an evaluator. """ - evaluator = TrainEvaluator( + resampling_strategy = fixed_pipeline_params.backend.load_datamanager().resampling_strategy + check_resampling_strategy(resampling_strategy) + + # NoResamplingStrategyTypes ==> test evaluator, otherwise ==> train evaluator + evaluator = Evaluator( queue=queue, evaluator_params=evaluator_params, fixed_pipeline_params=fixed_pipeline_params diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 2203e35a8..bded4b701 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -24,13 +24,8 @@ from smac.tae.execute_func import AbstractTAFunc from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, - HoldoutValTypes, - NoResamplingStrategyTypes -) -from autoPyTorch.evaluation.test_evaluator import eval_test_function -from autoPyTorch.evaluation.train_evaluator import eval_train_function +from autoPyTorch.evaluation.abstract_evaluator import EvaluatorParams, FixedPipelineParams +from autoPyTorch.evaluation.evaluator import eval_fn from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, empty_queue, @@ -65,6 +60,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> PynisherResultsType: raise NotImplementedError +# Since PynisherFunctionWrapperLikeType is not the exact type, we added Any... 
PynisherFunctionWrapperType = Union[Any, PynisherFunctionWrapperLikeType] @@ -102,7 +98,7 @@ def _get_eval_fn(cost_for_crash: float, target_algorithm: Optional[Callable] = N else: return functools.partial( run_target_algorithm_with_exception_handling, - ta=autoPyTorch.evaluation.train_evaluator.eval_fn, + ta=eval_fn, cost_for_crash=cost_for_crash, ) @@ -272,28 +268,9 @@ def __init__( all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): - - self.backend = backend - - dm = self.backend.load_datamanager() - if dm.val_tensors is not None: - self._get_validation_loss = True - else: - self._get_validation_loss = False - if dm.test_tensors is not None: - self._get_test_loss = True - else: - self._get_test_loss = False - - self.resampling_strategy = dm.resampling_strategy - self.resampling_strategy_args = dm.resampling_strategy_args - - if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - eval_function = eval_train_function - self.output_y_hat_optimization = output_y_hat_optimization - elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - eval_function = eval_test_function - self.output_y_hat_optimization = False + dm = backend.load_datamanager() + self._exist_val_tensor = (dm.val_tensors is not None) + self._exist_test_tensor = (dm.test_tensors is not None) self.worst_possible_result = cost_for_crash @@ -306,43 +283,48 @@ def __init__( abort_on_first_run_crash=abort_on_first_run_crash, ) + # TODO: Modify so that we receive fixed_params from outside + self.fixed_pipeline_params = FixedPipelineParams.with_default_pipeline_config( + pipeline_config=pipeline_config, + backend=backend, + seed=seed, + metric=metric, + save_y_opt=save_y_opt, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + search_space_updates=search_space_updates, + ) self.pynisher_context = pynisher_context self.initial_num_run = initial_num_run - self.metric = metric - self.include = include - self.exclude = exclude - self.disable_file_output = disable_file_output self.init_params = init_params self.logger = _get_logger(logger_port, 'TAE') self.memory_limit = int(math.ceil(memory_limit)) if memory_limit is not None else memory_limit - dm = backend.load_datamanager() - self._exist_val_tensor = (dm.val_tensors is not None) - self._exist_test_tensor = (dm.test_tensors is not None) - @property def eval_fn(self) -> Callable: # this is a target algorithm defined in AbstractTAFunc during super().__init__(ta) return self.ta # type: ignore - self.search_space_updates = search_space_updates + @property + def budget_type(self) -> str: + # budget is defined by epochs by default + return self.fixed_pipeline_params.budget_type def _check_and_get_default_budget(self) -> float: budget_type_choices = ('epochs', 'runtime') + pipeline_config = self.fixed_pipeline_params.pipeline_config budget_choices = { - budget_type: float(self.pipeline_config.get(budget_type, np.inf)) + budget_type: float(pipeline_config.get(budget_type, np.inf)) for budget_type in budget_type_choices } - # budget is defined by epochs by default - budget_type = str(self.pipeline_config.get('budget_type', 'epochs')) - if self.budget_type is not None: - budget_type = self.budget_type - - if budget_type not in budget_type_choices: - raise ValueError(f"budget type must be in {budget_type_choices}, but got {budget_type}") + if self.budget_type not in budget_type_choices: + raise 
ValueError(f"budget type must be in {budget_type_choices}, but got {self.budget_type}") else: - return budget_choices[budget_type] + return budget_choices[self.budget_type] def run_wrapper(self, run_info: RunInfo) -> Tuple[RunInfo, RunValue]: """ @@ -363,12 +345,10 @@ def run_wrapper(self, run_info: RunInfo) -> Tuple[RunInfo, RunValue]: is_intensified = (run_info.budget != 0) default_budget = self._check_and_get_default_budget() - if self.budget_type is None and is_intensified: - raise ValueError(f'budget must be 0 (=no intensification) for budget_type=None, but got {run_info.budget}') - if self.budget_type is not None and run_info.budget < 0: + if run_info.budget < 0: raise ValueError(f'budget must be greater than zero but got {run_info.budget}') - if self.budget_type is not None and not is_intensified: + if not is_intensified: # The budget will be provided in train evaluator when budget_type is None run_info = run_info._replace(budget=default_budget) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 1a13a048d..60d319d99 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -1,7 +1,7 @@ import copy import json import logging.handlers -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple import ConfigSpace from ConfigSpace.configuration_space import Configuration @@ -22,7 +22,7 @@ CrossValTypes, DEFAULT_RESAMPLING_PARAMETERS, HoldoutValTypes, - NoResamplingStrategyTypes + ResamplingStrategies ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import TargetAlgorithmQuery @@ -98,9 +98,7 @@ def __init__(self, pipeline_config: Dict[str, Any], start_num_run: int = 1, seed: int = 1, - resampling_strategy: Union[HoldoutValTypes, - CrossValTypes, - NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 747688168..69ec9f321 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -3,7 +3,7 @@ import pickle import tempfile import unittest -from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_train_function +from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_fn import ConfigSpace as CS from ConfigSpace.configuration_space import Configuration @@ -40,44 +40,9 @@ HOLDOUT_NUM_SPLITS = 1 -# Test -# ==== -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', - new=dummy_eval_train_function) -@pytest.mark.parametrize('openml_id', (40981, )) -@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', - ((HoldoutValTypes.holdout_validation, None), - (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) - )) -def test_tabular_classification(openml_id, resampling_strategy, backend, resampling_strategy_args, n_samples): - - # Get the data and check that contents of data-manager make sense - X, y = sklearn.datasets.fetch_openml( - data_id=int(openml_id), - return_X_y=True, as_frame=True - ) - X, y = X.iloc[:n_samples], y.iloc[:n_samples] - - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, y, random_state=42) - - # Search for a good configuration - estimator = 
TabularClassificationTask( - backend=backend, - resampling_strategy=resampling_strategy, - resampling_strategy_args=resampling_strategy_args, - seed=42, - ) - - with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): - estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=40, - func_eval_time_limit_secs=10, - enable_traditional_pipeline=False, - ) +def _get_dataset(openml_id: int, n_samples: int, seed: int = 42, split: bool = True): + X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id), return_X_y=True, as_frame=True) + X, y = X[:n_samples], y[:n_samples] if split: X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=seed) @@ -86,24 +51,27 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl return X, y -def _check_created_files(estimator): +def _check_created_files(estimator, no_resampling): tmp_dir = estimator._backend.temporary_directory loaded_datamanager = estimator._backend.load_datamanager() assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) expected_files = [ - 'smac3-output/run_42/configspace.json', - 'smac3-output/run_42/runhistory.json', - 'smac3-output/run_42/scenario.txt', - 'smac3-output/run_42/stats.json', - 'smac3-output/run_42/train_insts.txt', - 'smac3-output/run_42/trajectory.json', - '.autoPyTorch/datamanager.pkl', - '.autoPyTorch/ensemble_read_preds.pkl', - '.autoPyTorch/start_time_42', - '.autoPyTorch/ensemble_history.json', - '.autoPyTorch/ensemble_read_losses.pkl', - '.autoPyTorch/true_targets_ensemble.npy', + fn + for fn in [ + 'smac3-output/run_42/configspace.json', + 'smac3-output/run_42/runhistory.json', + 'smac3-output/run_42/scenario.txt', + 'smac3-output/run_42/stats.json', + 'smac3-output/run_42/train_insts.txt', + 'smac3-output/run_42/trajectory.json', + '.autoPyTorch/datamanager.pkl', + '.autoPyTorch/start_time_42', + '.autoPyTorch/ensemble_read_preds.pkl' if not no_resampling else None, + '.autoPyTorch/ensemble_history.json' if not no_resampling else None, + '.autoPyTorch/ensemble_read_losses.pkl' if not no_resampling else None, + '.autoPyTorch/true_targets_ensemble.npy' if not no_resampling else None, + ] if fn is not None ] for expected_file in expected_files: assert os.path.exists(os.path.join(tmp_dir, expected_file)) @@ -111,11 +79,16 @@ def _check_created_files(estimator): def _check_internal_dataset_settings(estimator, resampling_strategy, task_type: str): assert estimator.dataset.task_type == task_type - expected_num_splits = HOLDOUT_NUM_SPLITS if resampling_strategy == HoldoutValTypes.holdout_validation \ - else CV_NUM_SPLITS assert estimator.resampling_strategy == resampling_strategy assert estimator.dataset.resampling_strategy == resampling_strategy - assert len(estimator.dataset.splits) == expected_num_splits + + if isinstance(resampling_strategy, NoResamplingStrategyTypes): + if resampling_strategy == HoldoutValTypes.holdout_validation: + assert len(estimator.dataset.splits) == HOLDOUT_NUM_SPLITS + elif resampling_strategy == CrossValTypes.k_fold_cross_validation: + assert len(estimator.dataset.splits) == CV_NUM_SPLITS + else: + assert len(estimator.dataset.splits) == 1 # no resampling ==> no split, i.e. 
1 def _check_smac_success(estimator, n_successful_runs: int = 1): @@ -150,6 +123,10 @@ def _check_model_file(estimator, resampling_strategy, run_key, run_key_model_run assert os.path.exists(model_file), model_file model = estimator._backend.load_model_by_seed_and_id_and_budget( estimator.seed, successful_num_run, run_key.budget) + elif resampling_strategy == NoResamplingStrategyTypes.no_resampling: + model_file = os.path.join(run_key_model_run_dir, + f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model") + assert os.path.exists(model_file), model_file elif resampling_strategy == CrossValTypes.k_fold_cross_validation: model_file = os.path.join( run_key_model_run_dir, @@ -169,8 +146,6 @@ def _check_model_file(estimator, resampling_strategy, run_key, run_key_model_run else: pytest.fail(resampling_strategy) - return model - def _check_test_prediction(estimator, X_test, y_test, run_key, run_key_model_run_dir, successful_num_run): test_prediction = os.path.join(run_key_model_run_dir, @@ -231,39 +206,6 @@ def _check_incumbent(estimator, successful_num_run): successful_num_run) assert 'train_loss' in incumbent_results - # Check that we can pickle - dump_file = os.path.join(estimator._backend.temporary_directory, 'dump.pkl') - - with open(dump_file, 'wb') as f: - pickle.dump(estimator, f) - - with open(dump_file, 'rb') as f: - restored_estimator = pickle.load(f) - restored_estimator.predict(X_test) - - # Test refit on dummy data - estimator.refit(dataset=backend.load_datamanager()) - - # Make sure that a configuration space is stored in the estimator - assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace) - - -@pytest.mark.parametrize('openml_name', ("boston", )) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', - new=dummy_eval_train_function) -@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', - ((HoldoutValTypes.holdout_validation, None), - (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) - )) -def test_tabular_regression(openml_name, resampling_strategy, backend, resampling_strategy_args, n_samples): - - # Get the data and check that contents of data-manager make sense - X, y = sklearn.datasets.fetch_openml( - openml_name, - return_X_y=True, - as_frame=True - ) - X, y = X.iloc[:n_samples], y.iloc[:n_samples] def _get_estimator( backend, @@ -280,21 +222,27 @@ def _get_estimator( **kwargs ): + is_no_resample = isinstance(resampling_strategy, NoResamplingStrategyTypes) + # No resampling strategy must have ensemble_size == 0 + cls_kwargs = {key: 0 for key in ['ensemble_size'] if is_no_resample} # Search for a good configuration estimator = task_class( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=42, + **cls_kwargs ) + # train size: 225, test size: 75 ==> 300 / 225 = 1.3333... 
+ mul_factor = 1.35 if is_no_resample else 1.0 # increase time for no resample with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): estimator.search( X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, optimize_metric=metric, - total_walltime_limit=total_walltime_limit, - func_eval_time_limit_secs=func_eval_time_limit_secs, + total_walltime_limit=total_walltime_limit * mul_factor, + func_eval_time_limit_secs=func_eval_time_limit_secs * mul_factor, enable_traditional_pipeline=False, **kwargs ) @@ -303,15 +251,24 @@ def _get_estimator( def _check_tabular_task(estimator, X_test, y_test, task_type, resampling_strategy, n_successful_runs): + no_resampling = isinstance(resampling_strategy, NoResamplingStrategyTypes) + _check_internal_dataset_settings(estimator, resampling_strategy, task_type=task_type) - _check_created_files(estimator) + _check_created_files(estimator, no_resampling) run_key_model_run_dir, run_key, successful_num_run = _check_smac_success(estimator, n_successful_runs=n_successful_runs) _check_model_file(estimator, resampling_strategy, run_key, run_key_model_run_dir, successful_num_run) _check_test_prediction(estimator, X_test, y_test, run_key, run_key_model_run_dir, successful_num_run) - _check_ensemble_prediction(estimator, run_key, run_key_model_run_dir, successful_num_run) + + if not no_resampling: + _check_ensemble_prediction(estimator, run_key, run_key_model_run_dir, successful_num_run) + _check_incumbent(estimator, successful_num_run) + if no_resampling: + # no ensemble for no resampling, so early-return + return + # Test refit on dummy data # This process yields a mysterious bug after _check_picklable # However, we can process it in the _check_picklable function. @@ -329,14 +286,16 @@ def _check_tabular_task(estimator, X_test, y_test, task_type, resampling_strateg # Test # ==== -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_fn', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_fn', new=dummy_eval_fn) @pytest.mark.parametrize('openml_id', (40981, )) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), - (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) + (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}), + (NoResamplingStrategyTypes.no_resampling, None) )) def test_tabular_classification(openml_id, resampling_strategy, backend, resampling_strategy_args, n_samples): + """NOTE: Check DummyEvaluator if something wrong""" X_train, X_test, y_train, y_test = _get_dataset(openml_id, n_samples, seed=42) estimator = _get_estimator( @@ -352,13 +311,15 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl @pytest.mark.parametrize('openml_id', (531, )) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_fn', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_fn', new=dummy_eval_fn) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), - (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) + (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}), + (NoResamplingStrategyTypes.no_resampling, None) )) def test_tabular_regression(openml_id, resampling_strategy, backend, resampling_strategy_args, n_samples): + """NOTE: Check DummyEvaluator if something wrong""" X, y = _get_dataset(openml_id, n_samples, split=False) # normalize values @@ -449,7 +410,7 @@ def 
test_do_dummy_prediction(dask_client, fit_dictionary_tabular): estimator._all_supported_metrics = False with pytest.raises(ValueError, match=r".*Dummy prediction failed with run state.*"): - with unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') as dummy: + with unittest.mock.patch('autoPyTorch.evaluation.tae.eval_fn') as dummy: dummy.side_effect = MemoryError estimator._do_dummy_prediction() @@ -475,8 +436,8 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', - new=dummy_eval_train_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_fn', + new=dummy_eval_fn) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection(openml_id, backend, n_samples): @@ -501,8 +462,8 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', - new=dummy_eval_train_function) +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_fn', + new=dummy_eval_fn) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection_failure(openml_id, backend, n_samples): @@ -755,140 +716,3 @@ def test_pipeline_fit_error( assert 'TIMEOUT' in str(run_value.status) assert pipeline is None - - -@pytest.mark.parametrize('openml_id', (40981, )) -def test_tabular_classification_test_evaluator(openml_id, backend, n_samples): - - # Get the data and check that contents of data-manager make sense - X, y = sklearn.datasets.fetch_openml( - data_id=int(openml_id), - return_X_y=True, as_frame=True - ) - X, y = X.iloc[:n_samples], y.iloc[:n_samples] - - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, y, random_state=42) - - # Search for a good configuration - estimator = TabularClassificationTask( - backend=backend, - resampling_strategy=NoResamplingStrategyTypes.no_resampling, - seed=42, - ensemble_size=0 - ) - - with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): - estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=50, - func_eval_time_limit_secs=20, - enable_traditional_pipeline=False, - ) - - # Internal dataset has expected settings - assert estimator.dataset.task_type == 'tabular_classification' - - assert estimator.resampling_strategy == NoResamplingStrategyTypes.no_resampling - assert estimator.dataset.resampling_strategy == NoResamplingStrategyTypes.no_resampling - # Check for the created files - tmp_dir = estimator._backend.temporary_directory - loaded_datamanager = estimator._backend.load_datamanager() - assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) - - expected_files = [ - 'smac3-output/run_42/configspace.json', - 'smac3-output/run_42/runhistory.json', - 'smac3-output/run_42/scenario.txt', - 'smac3-output/run_42/stats.json', - 'smac3-output/run_42/train_insts.txt', - 'smac3-output/run_42/trajectory.json', - '.autoPyTorch/datamanager.pkl', - '.autoPyTorch/start_time_42', - ] - for expected_file in expected_files: - assert os.path.exists(os.path.join(tmp_dir, expected_file)), "{}/{}/{}".format( - tmp_dir, - [data for data in pathlib.Path(tmp_dir).glob('*')], - expected_file, - ) - - # Check that smac was able to find proper models - succesful_runs = [run_value.status 
for run_value in estimator.run_history.data.values( - ) if 'SUCCESS' in str(run_value.status)] - assert len(succesful_runs) > 1, [(k, v) for k, v in estimator.run_history.data.items()] - - # Search for an existing run key in disc. A individual model might have - # a timeout and hence was not written to disc - successful_num_run = None - SUCCESS = False - for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if 'SUCCESS' in str(value.status): - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id + 1, run_key.budget) - successful_num_run = run_key.config_id + 1 - if os.path.exists(run_key_model_run_dir): - # Runkey config id is different from the num_run - # more specifically num_run = config_id + 1(dummy) - SUCCESS = True - break - - assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}" - - model_file = os.path.join(run_key_model_run_dir, - f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model") - assert os.path.exists(model_file), model_file - - # Make sure that predictions on the test data are printed and make sense - test_prediction = os.path.join(run_key_model_run_dir, - estimator._backend.get_prediction_filename( - 'test', estimator.seed, successful_num_run, - run_key.budget)) - assert os.path.exists(test_prediction), test_prediction - assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == np.shape(X_test)[0] - - y_pred = estimator.predict(X_test) - assert np.shape(y_pred)[0] == np.shape(X_test)[0] - - # Make sure that predict proba has the expected shape - probabilites = estimator.predict_proba(X_test) - assert np.shape(probabilites) == (np.shape(X_test)[0], 2) - - score = estimator.score(y_pred, y_test) - assert 'accuracy' in score - - # check incumbent config and results - incumbent_config, incumbent_results = estimator.get_incumbent_results() - assert isinstance(incumbent_config, Configuration) - assert isinstance(incumbent_results, dict) - assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data, - successful_num_run) - assert 'train_loss' in incumbent_results - - -@pytest.mark.parametrize("ans,task_class", ( - ("continuous", TabularRegressionTask), - ("multiclass", TabularClassificationTask)) -) -def test_task_inference(ans, task_class, backend): - # Get the data and check that contents of data-manager make sense - X = np.random.random((6, 1)) - y = np.array([-10 ** 12, 0, 1, 2, 3, 4], dtype=np.int64) + 10 ** 12 - - estimator = task_class( - backend=backend, - resampling_strategy=HoldoutValTypes.holdout_validation, - resampling_strategy_args=None, - seed=42, - ) - dataset = estimator.get_dataset(X, y) - assert dataset.output_type == ans - - y += 10 ** 12 + 10 # Check if the function catches overflow possibilities - if ans == 'continuous': - with pytest.raises(ValueError): # ValueError due to `Too large value` - estimator.get_dataset(X, y) - else: - estimator.get_dataset(X, y) diff --git a/test/test_api/utils.py b/test/test_api/utils.py index 0e757015d..45b5af562 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -4,11 +4,11 @@ from autoPyTorch.constants import REGRESSION_TASKS from autoPyTorch.evaluation.abstract_evaluator import fit_pipeline +from autoPyTorch.evaluation.evaluator import Evaluator from autoPyTorch.evaluation.pipeline_class_collection import ( DummyClassificationPipeline, DummyRegressionPipeline ) -from autoPyTorch.evaluation.train_evaluator import TrainEvaluator 
from autoPyTorch.pipeline.traditional_tabular_classification import TraditionalTabularClassificationPipeline from autoPyTorch.utils.common import subsampler @@ -28,7 +28,7 @@ def dummy_traditional_classification(self, time_left: int, func_eval_time_limit_ # ======== # Fixtures # ======== -class DummyTrainEvaluator(TrainEvaluator): +class DummyEvaluator(Evaluator): def _get_pipeline(self): if self.task_type in REGRESSION_TASKS: pipeline = DummyRegressionPipeline(config=1) @@ -44,37 +44,21 @@ def _fit_and_evaluate_loss(self, pipeline, split_id, train_indices, opt_indices) self.logger.info("Model fitted, now predicting") kwargs = {'pipeline': pipeline, 'unique_train_labels': self.unique_train_labels[split_id]} + train_pred = self.predict(subsampler(self.X_train, train_indices), **kwargs) - opt_pred = self.predict(subsampler(self.X_train, opt_indices), **kwargs) - valid_pred = self.predict(self.X_valid, **kwargs) test_pred = self.predict(self.X_test, **kwargs) + valid_pred = self.predict(self.X_valid, **kwargs) + + # No resampling ===> evaluate on test dataset + opt_pred = self.predict(subsampler(self.X_train, opt_indices), **kwargs) if self.train else test_pred assert train_pred is not None and opt_pred is not None # mypy check return train_pred, opt_pred, valid_pred, test_pred # create closure for evaluating an algorithm -def dummy_eval_train_function( - backend, - queue, - metric, - budget: float, - config, - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include, - exclude, - disable_file_output, - pipeline_config=None, - budget_type=None, - init_params=None, - logger_port=None, - all_supported_metrics=True, - search_space_updates=None, - instance: str = None, -) -> None: - evaluator = DummyTrainEvaluator( +def dummy_eval_fn(queue, fixed_pipeline_params, evaluator_params): + evaluator = DummyEvaluator( queue=queue, fixed_pipeline_params=fixed_pipeline_params, evaluator_params=evaluator_params diff --git a/test/test_evaluation/test_evaluators.py b/test/test_evaluation/test_evaluators.py index aae259e08..449bf8d4a 100644 --- a/test/test_evaluation/test_evaluators.py +++ b/test/test_evaluation/test_evaluators.py @@ -18,8 +18,11 @@ from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.datasets.resampling_strategy import CrossValTypes, NoResamplingStrategyTypes -from autoPyTorch.evaluation.test_evaluator import TestEvaluator -from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.abstract_evaluator import EvaluatorParams, FixedPipelineParams +from autoPyTorch.evaluation.evaluator import ( + Evaluator, + _CrossValidationResultsManager, +) from autoPyTorch.evaluation.utils import read_queue from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy @@ -98,7 +101,7 @@ def test_merge_predictions(self): assert np.allclose(ans, cv_results._merge_predictions(preds)) -class TestTrainEvaluator(BaseEvaluatorTest, unittest.TestCase): +class TestEvaluator(BaseEvaluatorTest, unittest.TestCase): _multiprocess_can_split_ = True def setUp(self): @@ -146,7 +149,7 @@ def test_evaluate_loss(self): backend_api.load_datamanager = lambda: D fixed_params_dict = self.fixed_params._asdict() fixed_params_dict.update(backend=backend_api) - evaluator = TrainEvaluator( + evaluator = Evaluator( queue=multiprocessing.Queue(), fixed_pipeline_params=FixedPipelineParams(**fixed_params_dict), evaluator_params=self.eval_params @@ -171,7 +174,7 @@ def 
test_holdout(self, pipeline_mock): fixed_params_dict = self.fixed_params._asdict() fixed_params_dict.update(backend=backend_api) - evaluator = TrainEvaluator( + evaluator = Evaluator( queue=_queue, fixed_pipeline_params=FixedPipelineParams(**fixed_params_dict), evaluator_params=self.eval_params @@ -213,7 +216,7 @@ def test_cv(self, pipeline_mock): fixed_params_dict = self.fixed_params._asdict() fixed_params_dict.update(backend=backend_api) - evaluator = TrainEvaluator( + evaluator = Evaluator( queue=_queue, fixed_pipeline_params=FixedPipelineParams(**fixed_params_dict), evaluator_params=self.eval_params @@ -232,7 +235,7 @@ def test_cv(self, pipeline_mock): self.assertEqual(evaluator._save_to_backend.call_count, 1) self.assertEqual(result, 0.463768115942029) self.assertEqual(pipeline_mock.fit.call_count, 5) - # 9 calls because of the training, holdout and + # 15 calls because of the training, holdout and # test set (3 sets x 5 folds = 15) self.assertEqual(pipeline_mock.predict_proba.call_count, 15) call_args = evaluator._save_to_backend.call_args @@ -246,44 +249,93 @@ def test_cv(self, pipeline_mock): self.assertEqual(call_args[0][2].shape[0], D.test_tensors[1].shape[0]) - @unittest.mock.patch.object(TrainEvaluator, '_loss') - def test_save_to_backend(self, loss_mock): - D = get_regression_datamanager() - D.name = 'test' - self.backend_mock.load_datamanager.return_value = D + @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') + def test_no_resampling(self, pipeline_mock): + pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 10} + # Binary iris, contains 69 train samples, 31 test samples + D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) + pipeline_mock.predict_proba.side_effect = \ + lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + _queue = multiprocessing.Queue() - loss_mock.return_value = None + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D - evaluator = TrainEvaluator( + fixed_params_dict = self.fixed_params._asdict() + fixed_params_dict.update(backend=backend_api) + evaluator = Evaluator( queue=_queue, - fixed_pipeline_params=self.fixed_params, + fixed_pipeline_params=FixedPipelineParams(**fixed_params_dict), evaluator_params=self.eval_params ) - evaluator.y_opt = D.train_tensors[1] - key_ans = {'seed', 'idx', 'budget', 'model', 'cv_model', - 'ensemble_predictions', 'valid_predictions', 'test_predictions'} - - for cnt, pl in enumerate([['model'], ['model2', 'model2']], start=1): - self.backend_mock.get_model_dir.return_value = True - evaluator.pipelines = pl - self.assertTrue(evaluator._save_to_backend(D.train_tensors[1], None, D.test_tensors[1])) - call_list = self.backend_mock.save_numrun_to_dir.call_args_list[-1][1] - - self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, cnt) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, cnt) - self.assertEqual(call_list.keys(), key_ans) - self.assertIsNotNone(call_list['model']) - if len(pl) > 1: # ==> cross validation - # self.assertIsNotNone(call_list['cv_model']) - # TODO: Reflect the ravin's opinion - pass - else: # holdout ==> single thus no cv_model - self.assertIsNone(call_list['cv_model']) - - # Check for not containing NaNs - that the models don't predict nonsense - # for unseen data - 
D.train_tensors[1][0] = np.NaN - self.assertFalse(evaluator._save_to_backend(D.train_tensors[1], None, D.test_tensors[1])) + evaluator._save_to_backend = unittest.mock.Mock(spec=evaluator._save_to_backend) + evaluator._save_to_backend.return_value = True + + evaluator.evaluate_loss() + + rval = read_queue(evaluator.queue) + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + self.assertEqual(len(rval[0]), 3) + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) + + self.assertEqual(evaluator._save_to_backend.call_count, 1) + self.assertEqual(result, 0.5806451612903225) + self.assertEqual(pipeline_mock.fit.call_count, 1) + # 2 calls because of train and test set + self.assertEqual(pipeline_mock.predict_proba.call_count, 2) + call_args = evaluator._save_to_backend.call_args + self.assertIsNone(D.splits[0][1]) + self.assertIsNone(call_args[0][1]) + self.assertEqual(call_args[0][2].shape[0], D.test_tensors[1].shape[0]) + self.assertEqual(evaluator.pipelines[0].fit.call_count, 1) + + @unittest.mock.patch.object(Evaluator, '_loss') + def test_save_to_backend(self, loss_mock): + call_counter = 0 + no_resample_counter = 0 + for rs in [None, NoResamplingStrategyTypes.no_resampling]: + no_resampling = isinstance(rs, NoResamplingStrategyTypes) + D = get_regression_datamanager() if rs is None else get_regression_datamanager(rs) + D.name = 'test' + self.backend_mock.load_datamanager.return_value = D + _queue = multiprocessing.Queue() + loss_mock.return_value = None + + evaluator = Evaluator( + queue=_queue, + fixed_pipeline_params=self.fixed_params, + evaluator_params=self.eval_params + ) + evaluator.y_opt = D.train_tensors[1] + key_ans = {'seed', 'idx', 'budget', 'model', 'cv_model', + 'ensemble_predictions', 'valid_predictions', 'test_predictions'} + + for pl in [['model'], ['model2', 'model2']]: + call_counter += 1 + no_resample_counter += no_resampling + self.backend_mock.get_model_dir.return_value = True + evaluator.pipelines = pl + self.assertTrue(evaluator._save_to_backend(D.train_tensors[1], None, D.test_tensors[1])) + call_list = self.backend_mock.save_numrun_to_dir.call_args_list[-1][1] + + self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, call_counter - no_resample_counter) + self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, call_counter) + self.assertEqual(call_list.keys(), key_ans) + self.assertIsNotNone(call_list['model']) + if len(pl) > 1: # ==> cross validation + # self.assertIsNotNone(call_list['cv_model']) + # TODO: Reflect the ravin's opinion + pass + else: # holdout ==> single thus no cv_model + self.assertIsNone(call_list['cv_model']) + + # Check for not containing NaNs - that the models don't predict nonsense + # for unseen data + D.train_tensors[1][0] = np.NaN + self.assertFalse(evaluator._save_to_backend(D.train_tensors[1], None, D.test_tensors[1])) @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') def test_predict_proba_binary_classification(self, mock): @@ -296,7 +348,7 @@ def test_predict_proba_binary_classification(self, mock): _queue = multiprocessing.Queue() - evaluator = TrainEvaluator( + evaluator = Evaluator( queue=_queue, fixed_pipeline_params=self.fixed_params, evaluator_params=self.eval_params @@ -309,6 +361,34 @@ def test_predict_proba_binary_classification(self, mock): for i in range(7): self.assertEqual(0.9, Y_optimization_pred[i][1]) + @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') + def 
test_predict_proba_binary_classification_no_resampling(self, mock): + D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) + self.backend_mock.load_datamanager.return_value = D + mock.predict_proba.side_effect = lambda y, batch_size=None: np.array( + [[0.1, 0.9]] * y.shape[0] + ) + mock.side_effect = lambda **kwargs: mock + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + + fixed_params_dict = self.fixed_params._asdict() + fixed_params_dict.update(backend=backend_api) + + _queue = multiprocessing.Queue() + + evaluator = Evaluator( + queue=_queue, + fixed_pipeline_params=self.fixed_params, + evaluator_params=self.eval_params + ) + evaluator.evaluate_loss() + Y_test_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][-1][ + 'ensemble_predictions'] + + for i in range(7): + self.assertEqual(0.9, Y_test_pred[i][1]) + def test_get_results(self): _queue = multiprocessing.Queue() for i in range(5): @@ -334,7 +414,7 @@ def test_additional_metrics_during_training(self, pipeline_mock): fixed_params_dict = self.fixed_params._asdict() fixed_params_dict.update(backend=backend_api) - evaluator = TrainEvaluator( + evaluator = Evaluator( queue=_queue, fixed_pipeline_params=FixedPipelineParams(**fixed_params_dict), evaluator_params=self.eval_params @@ -350,155 +430,3 @@ def test_additional_metrics_during_training(self, pipeline_mock): self.assertIn('additional_run_info', result) self.assertIn('opt_loss', result['additional_run_info']) self.assertGreater(len(result['additional_run_info']['opt_loss'].keys()), 1) - - -class TestTestEvaluator(BaseEvaluatorTest, unittest.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - """ - Creates a backend mock - """ - tmp_dir_name = self.id() - self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name) - if os.path.exists(self.ev_path): - shutil.rmtree(self.ev_path) - os.makedirs(self.ev_path, exist_ok=False) - dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] - dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)] - dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)] - backend_mock = unittest.mock.Mock() - backend_mock.get_model_dir.return_value = self.ev_path - backend_mock.get_cv_model_dir.return_value = self.ev_path - backend_mock.get_model_path.side_effect = dummy_model_files - backend_mock.get_cv_model_path.side_effect = dummy_cv_model_files - backend_mock.get_prediction_output_path.side_effect = dummy_pred_files - backend_mock.temporary_directory = self.ev_path - self.backend_mock = backend_mock - - self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir') - self.output_dir = os.path.join(self.ev_path, 'out_dir') - - def tearDown(self): - if os.path.exists(self.ev_path): - shutil.rmtree(self.ev_path) - - @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') - def test_no_resampling(self, pipeline_mock): - # Binary iris, contains 69 train samples, 31 test samples - D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) - pipeline_mock.side_effect = lambda **kwargs: pipeline_mock - pipeline_mock.get_additional_run_info.return_value = None - pipeline_mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} - - configuration = 
unittest.mock.Mock(spec=Configuration) - backend_api = create(self.tmp_dir, self.output_dir, 'autoPyTorch') - backend_api.load_datamanager = lambda: D - queue_ = multiprocessing.Queue() - - evaluator = TestEvaluator(backend_api, queue_, configuration=configuration, metric=accuracy, budget=0) - evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) - evaluator.file_output.return_value = (None, {}) - - evaluator.fit_predict_and_loss() - - rval = read_queue(evaluator.queue) - self.assertEqual(len(rval), 1) - result = rval[0]['loss'] - self.assertEqual(len(rval[0]), 3) - self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) - - self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 0.5806451612903225) - self.assertEqual(pipeline_mock.fit.call_count, 1) - # 2 calls because of train and test set - self.assertEqual(pipeline_mock.predict_proba.call_count, 2) - self.assertEqual(evaluator.file_output.call_count, 1) - # Should be none as no val preds are mentioned - self.assertIsNone(evaluator.file_output.call_args[0][1]) - # Number of y_test_preds and Y_test should be the same - self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], - D.test_tensors[1].shape[0]) - self.assertEqual(evaluator.pipeline.fit.call_count, 1) - - @unittest.mock.patch.object(TestEvaluator, '_loss') - def test_file_output(self, loss_mock): - - D = get_regression_datamanager(NoResamplingStrategyTypes.no_resampling) - D.name = 'test' - self.backend_mock.load_datamanager.return_value = D - configuration = unittest.mock.Mock(spec=Configuration) - queue_ = multiprocessing.Queue() - loss_mock.return_value = None - - evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) - - self.backend_mock.get_model_dir.return_value = True - evaluator.pipeline = 'model' - evaluator.Y_optimization = D.train_tensors[1] - rval = evaluator.file_output( - D.train_tensors[1], - None, - D.test_tensors[1], - ) - - self.assertEqual(rval, (None, {})) - # These targets are not saved as Fit evaluator is not used to make an ensemble - self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 0) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), - {'seed', 'idx', 'budget', 'model', 'cv_model', - 'ensemble_predictions', 'valid_predictions', 'test_predictions'}) - self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) - self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) - - # Check for not containing NaNs - that the models don't predict nonsense - # for unseen data - D.test_tensors[1][0] = np.NaN - rval = evaluator.file_output( - D.train_tensors[1], - None, - D.test_tensors[1], - ) - self.assertEqual( - rval, - ( - 1.0, - { - 'error': - 'Model predictions for test set contains NaNs.' 
- }, - ) - ) - - @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') - def test_predict_proba_binary_classification(self, mock): - D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) - self.backend_mock.load_datamanager.return_value = D - mock.predict_proba.side_effect = lambda y, batch_size=None: np.array( - [[0.1, 0.9]] * y.shape[0] - ) - mock.side_effect = lambda **kwargs: mock - mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} - configuration = unittest.mock.Mock(spec=Configuration) - queue_ = multiprocessing.Queue() - - evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) - - evaluator.fit_predict_and_loss() - Y_test_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][-1][ - 'ensemble_predictions'] - - for i in range(7): - self.assertEqual(0.9, Y_test_pred[i][1]) - - def test_get_results(self): - queue_ = multiprocessing.Queue() - for i in range(5): - queue_.put((i * 1, 1 - (i * 0.2), 0, "", StatusType.SUCCESS)) - result = read_queue(queue_) - self.assertEqual(len(result), 5) - self.assertEqual(result[0][0], 0) - self.assertAlmostEqual(result[0][1], 1.0) diff --git a/test/test_evaluation/test_tae.py b/test/test_evaluation/test_tae.py index 351e7b633..0a187f6c2 100644 --- a/test/test_evaluation/test_tae.py +++ b/test/test_evaluation/test_tae.py @@ -102,7 +102,7 @@ def test_check_run_info(self): run_info = unittest.mock.Mock() run_info.budget = -1 with pytest.raises(ValueError): - taq._check_run_info(run_info) + taq.run_wrapper(run_info) def test_cutoff_update_in_run_wrapper(self): taq = _create_taq() diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index adfe3241b..ce1579caa 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -519,3 +519,16 @@ def test_train_pipeline_with_runtime_max_reached(fit_dictionary_tabular_dummy): patch.is_max_time_reached.return_value = True with pytest.raises(RuntimeError): pipeline.fit(fit_dictionary_tabular_dummy) + + +def test_get_pipeline_representation(): + pipeline = TabularClassificationPipeline( + dataset_properties={ + 'numerical_columns': None, + 'categorical_columns': None, + 'task_type': 'tabular_classification' + } + ) + repr = pipeline.get_pipeline_representation() + assert isinstance(repr, dict) + assert all(word in repr for word in ['Preprocessing', 'Estimator'])
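
Below is a standalone, simplified sketch — not code taken from the diff above — of the core dispatch this patch introduces: a single Evaluator that covers both the former TrainEvaluator and TestEvaluator by inspecting the dataset's resampling strategy, together with the new check_resampling_strategy helper. The enum stubs and the describe() method are illustrative stand-ins; the real classes live in autoPyTorch.datasets.resampling_strategy and autoPyTorch.evaluation.evaluator and additionally carry the backend, queue, pipeline and budget state shown in the diff.

    # Illustrative sketch of the dispatch logic merged into evaluator.py (assumptions noted above).
    from enum import Enum
    from typing import Optional, Union


    class CrossValTypes(Enum):
        k_fold_cross_validation = 1


    class HoldoutValTypes(Enum):
        holdout_validation = 2


    class NoResamplingStrategyTypes(Enum):
        no_resampling = 3


    ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]


    def check_resampling_strategy(resampling_strategy: Optional[ResamplingStrategies]) -> None:
        """Raise if the strategy does not belong to one of the supported enum families."""
        choices = (CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes)
        if not isinstance(resampling_strategy, choices):
            rs_names = [choice.__name__ for choice in choices]  # a list, so the names render in the message
            raise ValueError(f'resampling_strategy must be in {rs_names}, but got {resampling_strategy}')


    class Evaluator:
        """Single class replacing the former TrainEvaluator/TestEvaluator pair (sketch only)."""

        def __init__(self, resampling_strategy: ResamplingStrategies):
            check_resampling_strategy(resampling_strategy)
            # train is False only for "no resampling": the test set then serves as the optimization split.
            self.train = not isinstance(resampling_strategy, NoResamplingStrategyTypes)
            self.cross_validation = isinstance(resampling_strategy, CrossValTypes)

        def describe(self) -> str:
            if not self.train:
                return 'fit on all training data, evaluate on the test set (old TestEvaluator behaviour)'
            if self.cross_validation:
                return 'k-fold cross-validation, losses aggregated over folds'
            return 'holdout split, validation fold used for SMBO (old TrainEvaluator default)'


    if __name__ == '__main__':
        for strategy in (HoldoutValTypes.holdout_validation,
                         CrossValTypes.k_fold_cross_validation,
                         NoResamplingStrategyTypes.no_resampling):
            print(strategy, '->', Evaluator(strategy).describe())

With NoResamplingStrategyTypes the optimization targets and predictions come from the test split (self.train is False), which is why the API tests in this patch construct the task with ensemble_size=0 and do not expect the ensemble files to be written to disc.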
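
A second illustrative sketch, adapted from the simplified _check_and_get_default_budget in autoPyTorch/evaluation/tae.py above. In the patched TargetAlgorithmQuery the pipeline config and budget type are read from self.fixed_pipeline_params; here they are passed explicitly so the snippet is self-contained, and the function name is a stand-in for the method.

    # Sketch of the default-budget lookup after this patch (pipeline_config assumed to be a plain dict).
    import numpy as np


    def check_and_get_default_budget(pipeline_config: dict, budget_type: str) -> float:
        budget_type_choices = ('epochs', 'runtime')
        budget_choices = {
            choice: float(pipeline_config.get(choice, np.inf)) for choice in budget_type_choices
        }
        if budget_type not in budget_type_choices:
            raise ValueError(f"budget type must be in {budget_type_choices}, but got {budget_type}")
        return budget_choices[budget_type]


    # Example: with {'budget_type': 'epochs', 'epochs': 50} and budget_type='epochs' this returns 50.0,
    # which run_wrapper() substitutes for a zero budget when the run is not intensified.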