From 497f8f78cc27036b87a4202cab1629f04ece427a Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Tue, 21 Dec 2021 19:19:15 +0100
Subject: [PATCH 1/6] add component with working example

---
 autoPyTorch/api/base_task.py                  | 358 +++++++++++++++---
 .../example_posthoc_ensemble_fit.py           |  82 ++++
 2 files changed, 378 insertions(+), 62 deletions(-)
 create mode 100644 examples/40_advanced/example_posthoc_ensemble_fit.py

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index c5468eae7..96cf254a6 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -717,6 +717,23 @@ def _load_best_individual_model(self) -> SingleBest:
 
         return ensemble
 
+    def _cleanup(self) -> None:
+        """
+        Closes the different servers created during the API search.
+        Returns:
+            None
+        """
+        if hasattr(self, '_logger') and self._logger is not None:
+            self._logger.info("Closing the dask infrastructure")
+            self._close_dask_client()
+            self._logger.info("Finished closing the dask infrastructure")
+
+            # Clean up the logger
+            self._logger.info("Starting to clean up the logger")
+            self._clean_logger()
+        else:
+            self._close_dask_client()
+
     def _do_dummy_prediction(self) -> None:
 
         assert self._metric is not None
@@ -914,6 +931,35 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
                                            save_external=True)
         return
 
+    def run_traditional_ml(
+        self,
+        current_task_name: str,
+        runtime_limit: int,
+        func_eval_time_limit_secs: int
+    ) -> None:
+        """
+        This function can be used to run the suite of traditional machine
+        learning models during the current task (e.g. ensemble fit, search).
+
+        Args:
+            current_task_name (str): name of the current task,
+            runtime_limit (int): time limit for fitting traditional models,
+            func_eval_time_limit_secs (int): Time limit
+                for a single call to the machine learning model.
+                Model fitting will be terminated if the machine
+                learning algorithm runs over the time limit.
+ """ + assert self._logger is not None # for mypy compliancy + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(current_task_name) + time_for_traditional = int(runtime_limit - elapsed_time) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) + def _search( self, optimize_metric: str, @@ -1182,28 +1228,25 @@ def _search( ) # ============> Run dummy predictions - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) + # We only want to run dummy predictions in case we want to build an ensemble + if self.ensemble_size > 0: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) # ============> Run traditional ml - - if enable_traditional_pipeline: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - # We want time for at least 1 Neural network in SMAC - time_for_traditional = int( - self._time_for_task - elapsed_time - func_eval_time_limit_secs - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC + if enable_traditional_pipeline and self.ensemble_size > 0: + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble + self.precision = precision + self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) proc_ensemble = None @@ -1220,28 +1263,12 @@ def _search( self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' self._stopwatch.start_task(ensemble_task_name) - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=str(dataset.dataset_name), - output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type], - task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric], - opt_metric=optimize_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - max_iterations=None, - read_at_most=sys.maxsize, - ensemble_memory_limit=self._memory_limit, - random_state=self.seed, - precision=precision, - logger_port=self._logger_port, - pynisher_context=self._multiprocessing_context, - metrics_kwargs=self._metrics_kwargs, - ) + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric + ) self._stopwatch.stop_task(ensemble_task_name) # ==> Run SMAC @@ -1311,22 +1338,8 @@ def _search( self._logger.info("Starting Shutdown") if 
         if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)
+
         self._logger.info("Closing the dask infrastructure")
         self._close_dask_client()
@@ -1337,9 +1350,7 @@ def _search(
             self._load_models()
             self._logger.info("Finished loading models...")
 
-        # Clean up the logger
-        self._logger.info("Starting to clean up the logger")
-        self._clean_logger()
+        self._cleanup()
 
         return self
 
@@ -1440,7 +1451,7 @@ def refit(
                 split_id=split_id)
             fit_and_suppress_warnings(self._logger, model, X, y=None)
 
-        self._clean_logger()
+        self._cleanup()
 
         return self
 
@@ -1688,7 +1699,7 @@ def fit_pipeline(
             disable_file_output=disable_file_output
         )
 
-        self._clean_logger()
+        self._cleanup()
 
         return fitted_pipeline, run_info, run_value, dataset
 
@@ -1723,6 +1734,229 @@ def _get_fitted_pipeline(
             budget=float(run_info.budget),
         )
 
+    def fit_ensemble(
+        self,
+        optimize_metric: Optional[str] = None,
+        precision: Optional[int] = None,
+        ensemble_nbest: int = 50,
+        ensemble_size: int = 50,
+        load_models: bool = True,
+        time_for_task: int = 100,
+        func_eval_time_limit_secs: int = 50,
+        enable_traditional_pipeline: bool = True,
+    ) -> 'BaseTask':
+        """
+        Enables post-hoc fitting of the ensemble after the `search()`
+        method is finished. This method creates an ensemble using all
+        the models stored on disk during the SMBO run.
+
+        Args:
+            optimize_metric (str): name of the metric that is used to
+                evaluate a pipeline. If not specified, the value passed to `search()` will be used
+            precision (Optional[int]): Numeric precision used when loading
+                ensemble data. Can be either 16, 32 or 64.
+            ensemble_nbest (Optional[int]):
+                only consider the ensemble_nbest models to build the ensemble.
+                If None, uses the value stored in class attribute `ensemble_nbest`.
+            ensemble_size (int) (default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            enable_traditional_pipeline (bool), (default=True):
+                We fit traditional machine learning algorithms
+                (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
+                prior to building PyTorch Neural Networks. You can disable this
+                feature by turning this flag to False. All machine learning
+                algorithms that are fitted during search() are considered for
+                ensemble building.
+            load_models (bool), (default=True): Whether to load the
+                models after fitting AutoPyTorch.
+            time_for_task (int), (default=100): Time limit
+                in seconds for the search of appropriate models.
+                By increasing this value, AutoPyTorch has a higher
+                chance of finding better models.
+            func_eval_time_limit_secs (int), (default=50): Time limit
+                for a single call to the machine learning model.
+                Model fitting will be terminated if the machine
+                learning algorithm runs over the time limit. Set
+                this value high enough so that typical machine
+                learning algorithms can be fit on the training
+                data.
+                If the given limit allows fewer than 2 models to be
+                fit, it is capped to time_for_task // 2 so that at
+                least 2 individual machine learning algorithms can be trained.
+                Set to np.inf in case no time limit is desired.
+
+        Returns:
+            self
+        """
+        # Make sure that input is valid
+        if self.dataset is None or self.opt_metric is None:
+            raise ValueError("fit_ensemble() can only be called after `search()`. "
+                             "Please call the `search()` method of {} prior to "
+                             "fit_ensemble().".format(self.__class__.__name__))
+
+        precision = precision if precision is not None else self.precision
+        if precision not in [16, 32, 64]:
+            raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision))
+
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset.dataset_name)
+
+        # Create a client if needed
+        if self._dask_client is None:
+            self._create_dask_client()
+        else:
+            self._is_dask_client_internally_created = False
+
+        ensemble_fit_task_name = 'EnsembleFit'
+        self._stopwatch.start_task(ensemble_fit_task_name)
+        if enable_traditional_pipeline:
+            if func_eval_time_limit_secs > time_for_task:
+                self._logger.warning(
+                    'Time limit for a single run is higher than total time '
+                    'limit. Capping the limit for a single run to the total '
+                    'time given to Ensemble fit (%f)' % time_for_task
+                )
+                func_eval_time_limit_secs = time_for_task
+
+            # Make sure that at least 2 models are created for the ensemble process
+            num_models = time_for_task // func_eval_time_limit_secs
+            if num_models < 2:
+                func_eval_time_limit_secs = time_for_task // 2
+                self._logger.warning(
+                    "Capping the func_eval_time_limit_secs to {} to have "
+                    "time for at least 2 models to ensemble.".format(
+                        func_eval_time_limit_secs
+                    )
+                )
+        # ============> Run Dummy predictions
+        dummy_task_name = 'runDummy'
+        self._stopwatch.start_task(dummy_task_name)
+        self._do_dummy_prediction()
+        self._stopwatch.stop_task(dummy_task_name)
+
+        # ============> Run traditional ml
+        if enable_traditional_pipeline:
+            self.run_traditional_ml(current_task_name=ensemble_fit_task_name,
+                                    runtime_limit=time_for_task,
+                                    func_eval_time_limit_secs=func_eval_time_limit_secs)
+
+        elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name)
+        time_left_for_ensemble = int(time_for_task - elapsed_time)
+        manager = self._init_ensemble_builder(
+            time_left_for_ensembles=time_left_for_ensemble,
+            optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric,
+            precision=precision,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+        )
+
+        manager.build_ensemble(self._dask_client)
+        if manager is not None:
+            self._collect_results_ensemble(manager)
+
+        if load_models:
+            self._load_models()
+
+        self._stopwatch.stop_task(ensemble_fit_task_name)
+
+        self._cleanup()
+
+        return self
+
+    def _init_ensemble_builder(
+        self,
+        time_left_for_ensembles: float,
+        optimize_metric: str,
+        ensemble_nbest: int,
+        ensemble_size: int,
+        precision: int = 32,
+    ) -> EnsembleBuilderManager:
+        """
+        Initializes an `EnsembleBuilderManager`.
+        Args:
+            time_left_for_ensembles (float):
+                Time (in seconds) allocated to building the ensemble
+            optimize_metric (str):
+                Name of the metric to optimize the ensemble.
+            ensemble_nbest (int):
+                only consider the ensemble_nbest models to build the ensemble.
+            ensemble_size (int):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            precision (int), (default=32): Numeric precision used when loading
+                ensemble data. Can be either 16, 32 or 64.
+
+        Returns:
+            EnsembleBuilderManager
+        """
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+        if self.dataset is None:
+            raise ValueError("ensemble can only be initialized after or during `search()`. "
+                             "Please call the `search()` method of {}.".format(self.__class__.__name__))
+
+        self._logger.info("Starting ensemble")
+        ensemble_task_name = 'ensemble'
+        self._stopwatch.start_task(ensemble_task_name)
+
+        # Use the current thread to start the ensemble builder process
+        # The function ensemble_builder_process will internally create an ensemble
+        # builder in the provided dask client
+        required_dataset_properties = {'task_type': self.task_type,
+                                       'output_type': self.dataset.output_type}
+        proc_ensemble = EnsembleBuilderManager(
+            start_time=time.time(),
+            time_left_for_ensembles=time_left_for_ensembles,
+            backend=copy.deepcopy(self._backend),
+            dataset_name=str(self.dataset.dataset_name),
+            output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type],
+            task_type=STRING_TO_TASK_TYPES[self.task_type],
+            metrics=[self._metric] if self._metric is not None else get_metrics(
+                dataset_properties=required_dataset_properties, names=[optimize_metric]),
+            opt_metric=optimize_metric,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=self.max_models_on_disc,
+            seed=self.seed,
+            max_iterations=None,
+            read_at_most=sys.maxsize,
+            ensemble_memory_limit=self._memory_limit,
+            random_state=self.seed,
+            precision=precision,
+            logger_port=self._logger_port,
+        )
+        self._stopwatch.stop_task(ensemble_task_name)
+
+        return proc_ensemble
+
+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Add ensemble runs that did not finish within the SMAC time limit
+            # to the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+
     def predict(
         self,
         X_test: np.ndarray,
@@ -1774,7 +2008,7 @@ def predict(
 
         predictions = self.ensemble_.predict(all_predictions)
 
-        self._clean_logger()
+        self._cleanup()
 
         return predictions
 
diff --git a/examples/40_advanced/example_posthoc_ensemble_fit.py b/examples/40_advanced/example_posthoc_ensemble_fit.py
new file mode 100644
index 000000000..cb2482b73
--- /dev/null
+++ b/examples/40_advanced/example_posthoc_ensemble_fit.py
@@ -0,0 +1,82 @@
+"""
+=====================================================
+Tabular Classification with Post-Hoc Ensemble Fitting
+=====================================================
+
+The following example shows how to fit a sample classification model
+and create an ensemble post-hoc with AutoPyTorch
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import sklearn.datasets
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+
+
+if __name__ == '__main__':
+
+    ############################################################################
+    # Data Loading
+    # ============
+    X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X,
+        y,
+        random_state=42,
+    )
+
+    ############################################################################
+    # Build and fit a classifier
+    # ==========================
+    api = TabularClassificationTask(
+        ensemble_size=0,
+        seed=42,
+    )
+
+    ############################################################################
+    # Search for the best neural network
+    # ==================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        optimize_metric='accuracy',
+        total_walltime_limit=100,
+        func_eval_time_limit_secs=50
+    )
+
+    ############################################################################
+    # Print the final performance of the incumbent neural network
+    # ===========================================================
+    print(api.run_history, api.trajectory)
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test)
+    print(score)
+
+    ############################################################################
+    # Fit an ensemble with the neural networks fitted during the search
+    # =================================================================
+
+    api.fit_ensemble(ensemble_size=5,
+                     # Set the enable_traditional_pipeline=True
+                     # to also include traditional models
+                     # in the ensemble
+                     enable_traditional_pipeline=False)
+    # Print the final ensemble built by AutoPyTorch
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test)
+    print(score)
+    print(api.show_models())
+    api._cleanup()
\ No newline at end of file

From 20514cb124aa242824380679ff2f1ed1ef0e6051 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Tue, 21 Dec 2021 22:28:50 +0100
Subject: [PATCH 2/6] add tests for ensemble init

---
 autoPyTorch/api/base_task.py   |  5 +++--
 test/test_api/test_base_api.py | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 96cf254a6..c936f9faf 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1914,8 +1914,9 @@ def _init_ensemble_builder(
             dataset_name=str(self.dataset.dataset_name),
             output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type],
             task_type=STRING_TO_TASK_TYPES[self.task_type],
-            metrics=[self._metric] if self._metric is not None else get_metrics(
-                dataset_properties=required_dataset_properties, names=[optimize_metric]),
+            metrics=get_metrics(
+                dataset_properties=required_dataset_properties,
+                names=[optimize_metric]),
             opt_metric=optimize_metric,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
index bb8f9c061..91e4d879d 100644
--- a/test/test_api/test_base_api.py
+++ b/test/test_api/test_base_api.py
@@ -12,8 +12,11 @@
 from autoPyTorch.api.base_task import BaseTask, _pipeline_predict
 from autoPyTorch.constants import TABULAR_CLASSIFICATION, TABULAR_REGRESSION
+from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
+from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
+from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
 
 
 # ====
@@ -201,3 +204,35 @@ def test_pipeline_get_budget_forecasting(fit_dictionary_forecasting, min_budget,
     assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config
     assert list(smac_mock.call_args)[1]['max_budget'] == max_budget
     assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget
+
+
+def test_init_ensemble_builder(backend):
+    BaseTask.__abstractmethods__ = set()
+    estimator = BaseTask(
+        backend=backend,
+        ensemble_size=0,
+    )
+
+    # Setup pre-requisites normally set by search()
+    estimator._logger = estimator._get_logger('test')
+    estimator.task_type = "tabular_classification"
+    estimator._memory_limit = 60
+    estimator.dataset = MagicMock(spec=BaseDataset)
+    estimator.dataset.output_type = 'binary'
+    estimator.dataset.dataset_name = 'dummy'
+
+    proc_ensemble = estimator._init_ensemble_builder(
+        time_left_for_ensembles=60,
+        optimize_metric='accuracy',
+        ensemble_nbest=10,
+        ensemble_size=5
+    )
+
+    assert isinstance(proc_ensemble, EnsembleBuilderManager)
+    assert proc_ensemble.opt_metric == 'accuracy'
+    assert proc_ensemble.metrics[0] == accuracy
+
+    estimator._close_dask_client()
+    estimator._clean_logger()
+
+    del estimator

From fd71cb7cd0493f68206703bebb21fe22fb49dfb6 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Thu, 23 Dec 2021 12:35:44 +0100
Subject: [PATCH 3/6] move ensemble arguments to search function

---
 autoPyTorch/api/base_task.py              | 76 +++++++++++++----------
 autoPyTorch/api/tabular_classification.py | 36 +++++------
 autoPyTorch/api/tabular_regression.py     | 42 +++++++------
 test/test_api/test_api.py                 |  1 -
 test/test_api/test_base_api.py            |  5 +-
 5 files changed, 87 insertions(+), 73 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index c936f9faf..d9db1231c 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -128,16 +128,6 @@ class BaseTask(ABC):
             Number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc. It also controls the size of
-            the ensemble as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
             Folder to store predictions for optional test set
@@ -173,9 +163,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -195,9 +182,6 @@ def __init__(
         self.seed = seed
         self.n_jobs = n_jobs
         self.n_threads = n_threads
-        self.ensemble_size = ensemble_size
-        self.ensemble_nbest = ensemble_nbest
-        self.max_models_on_disc = max_models_on_disc
         self.logging_config: Optional[Dict] = logging_config
         self.include_components: Optional[Dict] = include_components
         self.exclude_components: Optional[Dict] = exclude_components
@@ -980,6 +964,9 @@ def _search(
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
         dask_client: Optional[dask.distributed.Client] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
         **kwargs: Any
     ) -> 'BaseTask':
         """
@@ -1108,6 +1095,16 @@ def _search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular `_
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
             kwargs: Any additional arguments that are customed by some specific task.
                For instance, forecasting tasks require:
                    forecasting_init_cfg (Dict[str, Any]): the initial configuration
                        hyperparameters are determined by the default configurations
                    custom_init_setting_path (str): The path to the initial hyperparameter
                        configurations set by the users
+
         Returns:
             self
 
@@ -1148,13 +1146,14 @@ def _search(
         self._disable_file_output = disable_file_output if disable_file_output is not None else []
         if (
             DisableFileOutputParameters.y_optimization in self._disable_file_output
-            and self.ensemble_size > 1
+            and ensemble_size > 1
         ):
             self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
                                  f" is in disable_file_output")
 
         self._memory_limit = memory_limit
         self._time_for_task = total_walltime_limit
+
         # Save start time to backend
         self._backend.save_start_time(str(self.seed))
 
@@ -1218,7 +1217,7 @@ def _search(
 
         # Make sure that at least 2 models are created for the ensemble process
         num_models = time_left_for_modelfit // func_eval_time_limit_secs
-        if num_models < 2 and self.ensemble_size > 0:
+        if num_models < 2 and ensemble_size > 0:
             func_eval_time_limit_secs = time_left_for_modelfit // 2
             self._logger.warning(
                 "Capping the func_eval_time_limit_secs to {} to have "
@@ -1229,7 +1228,7 @@ def _search(
 
         # ============> Run dummy predictions
         # We only want to run dummy predictions in case we want to build an ensemble
-        if self.ensemble_size > 0:
+        if ensemble_size > 0:
             dummy_task_name = 'runDummy'
             self._stopwatch.start_task(dummy_task_name)
             self._do_dummy_prediction()
@@ -1238,7 +1237,7 @@ def _search(
         # ============> Run traditional ml
         # We only want to run traditional predictions in case we want to build an ensemble
         # We want time for at least 1 Neural network in SMAC
-        if enable_traditional_pipeline and self.ensemble_size > 0:
+        if enable_traditional_pipeline and ensemble_size > 0:
             traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
             self.run_traditional_ml(current_task_name=self.dataset_name,
                                     runtime_limit=traditional_runtime_limit,
@@ -1253,21 +1252,22 @@ def _search(
         if time_left_for_ensembles <= 0:
             # Fit only raises error when ensemble_size is not zero but
             # time_left_for_ensembles is zero.
-            if self.ensemble_size > 0:
+            if ensemble_size > 0:
                 raise ValueError("Not starting ensemble builder because there "
                                  "is no time left. Try increasing the value "
                                  "of time_left_for_this_task.")
-        elif self.ensemble_size <= 0:
+        elif ensemble_size <= 0:
             self._logger.info("Not starting ensemble builder as ensemble size is 0")
         else:
             self._logger.info("Starting ensemble")
             ensemble_task_name = 'ensemble'
             self._stopwatch.start_task(ensemble_task_name)
             proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
-                                                        ensemble_size=self.ensemble_size,
-                                                        ensemble_nbest=self.ensemble_nbest,
+                                                        ensemble_size=ensemble_size,
+                                                        ensemble_nbest=ensemble_nbest,
                                                         precision=precision,
-                                                        optimize_metric=self.opt_metric
+                                                        optimize_metric=self.opt_metric,
+                                                        max_models_on_disc=max_models_on_disc
                                                         )
             self._stopwatch.stop_task(ensemble_task_name)
 
@@ -1740,6 +1740,7 @@ def fit_ensemble(
         precision: Optional[int] = None,
         ensemble_nbest: int = 50,
         ensemble_size: int = 50,
+        max_models_on_disc: int = 50,
         load_models: bool = True,
         time_for_task: int = 100,
         func_eval_time_limit_secs: int = 50,
@@ -1755,13 +1756,16 @@ def fit_ensemble(
                 evaluate a pipeline. If not specified, the value passed to `search()` will be used
             precision (Optional[int]): Numeric precision used when loading
                 ensemble data. Can be either 16, 32 or 64.
-            ensemble_nbest (Optional[int]):
-                only consider the ensemble_nbest models to build the ensemble.
-                If None, uses the value stored in class attribute `ensemble_nbest`.
-            ensemble_size (int) (default=50):
+            ensemble_size (int: default=50):
                 Number of models added to the ensemble built by
                 Ensemble selection from libraries of models.
                 Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
             enable_traditional_pipeline (bool), (default=True):
                 We fit traditional machine learning algorithms
                 (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1850,6 +1854,7 @@ def fit_ensemble(
             precision=precision,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc
         )
 
@@ -1871,6 +1876,7 @@ def _init_ensemble_builder(
         optimize_metric: str,
         ensemble_nbest: int,
         ensemble_size: int,
+        max_models_on_disc: int = 50,
         precision: int = 32,
     ) -> EnsembleBuilderManager:
         """
@@ -1880,13 +1886,17 @@ def _init_ensemble_builder(
                 Time (in seconds) allocated to building the ensemble
             optimize_metric (str):
                 Name of the metric to optimize the ensemble.
-            ensemble_nbest (int):
-                only consider the ensemble_nbest models to build the ensemble.
             ensemble_size (int):
                 Number of models added to the ensemble built by
                 Ensemble selection from libraries of models.
                 Models are drawn with replacement.
-            precision (int), (default=32): Numeric precision used when loading
+            ensemble_nbest (int):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
+            precision (int: default=32): Numeric precision used when loading
                 ensemble data. Can be either 16, 32 or 64.
 
         Returns:
@@ -1920,7 +1930,7 @@ def _init_ensemble_builder(
             opt_metric=optimize_metric,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=self.max_models_on_disc,
+            max_models_on_disc=max_models_on_disc,
             seed=self.seed,
             max_iterations=None,
             read_at_most=sys.maxsize,
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
index facb59f99..3ce8ef216 100644
--- a/autoPyTorch/api/tabular_classification.py
+++ b/autoPyTorch/api/tabular_classification.py
@@ -39,18 +39,6 @@ class TabularClassificationTask(BaseTask):
             number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
             Folder to store predictions for optional test set
@@ -85,9 +73,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -104,9 +89,6 @@ def __init__(
             n_jobs=n_jobs,
             n_threads=n_threads,
             logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
             temporary_directory=temporary_directory,
             output_directory=output_directory,
             delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -260,6 +242,9 @@ def search(
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
         dataset_compression: Union[Mapping[str, Any], bool] = False,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -429,6 +414,18 @@ def search(
                 Subsampling takes into account classification labels and stratifies
                 accordingly. We guarantee that at least one occurrence of each
                 label is included in the sampled set.
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.
 
         Returns:
             self
@@ -464,6 +461,9 @@ def search(
             disable_file_output=disable_file_output,
             load_models=load_models,
             portfolio_selection=portfolio_selection,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc,
         )
 
     def predict(
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
index e0c1e4eac..3c8f42aad 100644
--- a/autoPyTorch/api/tabular_regression.py
+++ b/autoPyTorch/api/tabular_regression.py
@@ -39,18 +39,6 @@ class TabularRegressionTask(BaseTask):
             number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
             Folder to store predictions for optional test set
@@ -86,9 +74,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -105,9 +90,6 @@ def __init__(
             n_jobs=n_jobs,
             n_threads=n_threads,
             logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
             temporary_directory=temporary_directory,
             output_directory=output_directory,
             delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -259,7 +241,10 @@ def search(
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
         dataset_compression: Union[Mapping[str, Any], bool] = False,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -390,6 +378,7 @@ def search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular `_.
            dataset_compression: Union[bool, Mapping[str, Any]] = True
                We compress datasets so that they fit into some predefined amount of memory.
                **NOTE**
                ...
                Subsampling takes into account classification labels and stratifies
                accordingly. We guarantee that at least one occurrence of each
                label is included in the sampled set.
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.
+>>>>>>> move ensemble arguments to search function Returns: self @@ -465,6 +468,9 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, ) def predict( diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 465d74c6b..1ef1611f1 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -759,7 +759,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular): estimator = TabularClassificationTask( backend=backend, resampling_strategy=HoldoutValTypes.holdout_validation, - ensemble_size=0, ) # Setup pre-requisites normally set by search() diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 91e4d879d..cb10c635c 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -232,7 +232,6 @@ def test_init_ensemble_builder(backend): assert proc_ensemble.opt_metric == 'accuracy' assert proc_ensemble.metrics[0] == accuracy - estimator._close_dask_client() - estimator._clean_logger() + estimator._cleanup() - del estimator + del estimator \ No newline at end of file From 8f8dee13bf5c89bfd0e038e97b6d065476721837 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sat, 25 Dec 2021 18:44:49 +0100 Subject: [PATCH 4/6] fix flake and issue #299 --- autoPyTorch/api/base_task.py | 25 ++--- .../example_posthoc_ensemble_fit.py | 100 +++++++++--------- test/test_api/test_base_api.py | 7 +- 3 files changed, 63 insertions(+), 69 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index d9db1231c..be214ac51 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -211,6 +211,7 @@ def __init__( self._scoring_functions: Optional[List[autoPyTorchMetric]] = None self._logger: Optional[PicklableClientLogger] = None self.dataset_name: Optional[str] = None + self.dataset = Optional[BaseDataset] self.cv_models_: Dict = {} self._results_manager = ResultsManager() @@ -684,20 +685,7 @@ def _load_best_individual_model(self) -> SingleBest: run_history=self.run_history, backend=self._backend, ) - if self._logger is None: - warnings.warn( - "No valid ensemble was created. Please check the log" - "file for errors. Default to the best individual estimator:{}".format( - ensemble.identifiers_ - ) - ) - else: - self._logger.exception( - "No valid ensemble was created. Please check the log" - "file for errors. Default to the best individual estimator:{}".format( - ensemble.identifiers_ - ) - ) + return ensemble @@ -1340,7 +1328,6 @@ def _search( if proc_ensemble is not None: self._collect_results_ensemble(proc_ensemble) - self._logger.info("Closing the dask infrastructure") self._close_dask_client() self._logger.info("Finished closing the dask infrastructure") @@ -1350,6 +1337,14 @@ def _search( self._load_models() self._logger.info("Finished loading models...") + if isinstance(self.ensemble_, SingleBest) and ensemble_size > 0: + self._logger.exception( + "No valid ensemble was created. Please check the log" + "file for errors. 
Default to the best individual estimator:{}".format( + self.ensemble_.identifiers_ + ) + ) + self._cleanup() return self diff --git a/examples/40_advanced/example_posthoc_ensemble_fit.py b/examples/40_advanced/example_posthoc_ensemble_fit.py index cb2482b73..801f35bf1 100644 --- a/examples/40_advanced/example_posthoc_ensemble_fit.py +++ b/examples/40_advanced/example_posthoc_ensemble_fit.py @@ -24,59 +24,59 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask -if __name__ == '__main__': +############################################################################ +# Data Loading +# ============ +X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, +) - ############################################################################ - # Data Loading - # ============ - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=42, - ) +############################################################################ +# Build and fit a classifier +# ========================== +api = TabularClassificationTask( + seed=42, +) - ############################################################################ - # Build and fit a classifier - # ========================== - api = TabularClassificationTask( - ensemble_size=0, - seed=42, - ) +############################################################################ +# Search for the best neural network +# ================================== +api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=100, + func_eval_time_limit_secs=50, + ensemble_size=0, +) - ############################################################################ - # Search for the best neural network - # ================================== - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=100, - func_eval_time_limit_secs=50 - ) +############################################################################ +# Print the final performance of the incumbent neural network +# =========================================================== +print(api.run_history, api.trajectory) +y_pred = api.predict(X_test) +score = api.score(y_pred, y_test) +print(score) - ############################################################################ - # Print the final performance of the incumbent neural network - # =========================================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) +############################################################################ +# Fit an ensemble with the neural networks fitted during the search +# ================================================================= - ############################################################################ - # Fit an ensemble with the neural networks fitted during the search - # ================================================================= +api.fit_ensemble(ensemble_size=5, + # Set the enable_traditional_pipeline=True + # to also include traditional models + # in the ensemble + enable_traditional_pipeline=False) +# Print the final ensemble built by 
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test)
+print(score)
+print(api.show_models())
 
-    api.fit_ensemble(ensemble_size=5,
-                     # Set the enable_traditional_pipeline=True
-                     # to also include traditional models
-                     # in the ensemble
-                     enable_traditional_pipeline=False)
-    # Print the final ensemble built by AutoPyTorch
-    y_pred = api.predict(X_test)
-    score = api.score(y_pred, y_test)
-    print(score)
-    print(api.show_models())
-    api._cleanup()
\ No newline at end of file
+# Print statistics from search
+print(api.sprint_statistics())
\ No newline at end of file
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
index cb10c635c..afaff86c9 100644
--- a/test/test_api/test_base_api.py
+++ b/test/test_api/test_base_api.py
@@ -15,8 +15,8 @@
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
 from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
-from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
+from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 
 
 # ====
@@ -225,8 +225,7 @@ def test_init_ensemble_builder(backend):
         time_left_for_ensembles=60,
         optimize_metric='accuracy',
         ensemble_nbest=10,
-        ensemble_size=5
-    )
+        ensemble_size=5)
 
     assert isinstance(proc_ensemble, EnsembleBuilderManager)
     assert proc_ensemble.opt_metric == 'accuracy'
@@ -234,4 +233,4 @@ def test_init_ensemble_builder(backend):
 
     estimator._cleanup()
 
-    del estimator
\ No newline at end of file
+    del estimator

From 5f3d4b6ed3c32b7eff91baa1c020bc14e263e5f3 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Sat, 25 Dec 2021 19:26:26 +0100
Subject: [PATCH 5/6] autoPyTorch/api/

---
 test/test_api/test_api.py      | 3 +--
 test/test_api/test_base_api.py | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index 1ef1611f1..d3bb71119 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -609,7 +609,6 @@ def test_tabular_input_support(openml_id, backend):
     estimator = TabularClassificationTask(
         backend=backend,
         resampling_strategy=HoldoutValTypes.holdout_validation,
-        ensemble_size=0,
     )
 
     estimator._do_dummy_prediction = unittest.mock.MagicMock()
@@ -624,6 +623,7 @@ def test_tabular_input_support(openml_id, backend):
         func_eval_time_limit_secs=50,
         enable_traditional_pipeline=False,
         load_models=False,
+        ensemble_size=0,
     )
 
 
@@ -633,7 +633,6 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
     estimator = TabularClassificationTask(
         backend=backend,
         resampling_strategy=HoldoutValTypes.holdout_validation,
-        ensemble_size=0,
     )
 
     # Setup pre-requisites normally set by search()
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
index afaff86c9..edc9499d7 100644
--- a/test/test_api/test_base_api.py
+++ b/test/test_api/test_base_api.py
@@ -118,7 +118,7 @@ def test_set_pipeline_config():
 ])
 def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected):
     BaseTask.__abstractmethods__ = set()
-    estimator = BaseTask(task_type='tabular_classification', ensemble_size=0)
+    estimator = BaseTask(task_type='tabular_classification')
 
     # Fixture pipeline config
     default_pipeline_config = {
@@ -141,7 +141,7 @@ def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, bud
     smac_mock.return_value = smac
     estimator._search(optimize_metric='accuracy', dataset=dataset, tae_func=pipeline_fit,
                       min_budget=min_budget, max_budget=max_budget, budget_type=budget_type,
-                      enable_traditional_pipeline=False,
+                      ensemble_size=0, enable_traditional_pipeline=False,
                       total_walltime_limit=20, func_eval_time_limit_secs=10,
                       load_models=False)
 
     assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config
@@ -210,7 +210,6 @@ def test_init_ensemble_builder(backend):
     BaseTask.__abstractmethods__ = set()
     estimator = BaseTask(
         backend=backend,
-        ensemble_size=0,
     )
 

From 200aa7fae16109fd18eb981efc8d7ee1ed09cd78 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Sat, 25 Dec 2021 19:26:51 +0100
Subject: [PATCH 6/6] fix flake and test errors

---
 autoPyTorch/api/base_task.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index be214ac51..e50862948 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -686,7 +686,6 @@ def _load_best_individual_model(self) -> SingleBest:
             backend=self._backend,
         )
-
         return ensemble
 
     def _cleanup(self) -> None:
@@ -1344,7 +1343,7 @@ def _search(
                     self.ensemble_.identifiers_
                 )
             )
-
+
         self._cleanup()
 
         return self
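
For reference, a minimal sketch of the user-facing API after this series (a hypothetical snippet, not part of the patches; it reuses the OpenML dataset, data_id 40981, from the example file above). The ensemble arguments that the task constructors used to accept are now per-call arguments of search(), and an ensemble can be built post-hoc with fit_ensemble():

import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)

# Before this series the ensemble configuration lived on the constructor,
# e.g. TabularClassificationTask(ensemble_size=50, ensemble_nbest=50, ...).
api = TabularClassificationTask(seed=42)

# The ensemble arguments are now passed per call; ensemble_size=0 skips
# ensemble building (and the dummy/traditional runs) during the search itself.
api.search(
    X_train=X_train,
    y_train=y_train,
    optimize_metric='accuracy',
    total_walltime_limit=100,
    func_eval_time_limit_secs=50,
    ensemble_size=0,
)

# Build the ensemble post-hoc from the models found during the search.
api.fit_ensemble(ensemble_size=5, enable_traditional_pipeline=False)
print(api.score(api.predict(X_test), y_test))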