diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 94add94bd..8687b77cd 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1,3 +1,18 @@
+"""Base class for tasks to solve
+* Provides the components shared among all the tasks
+* This module provides the optimization given a pipeline
+* This module plays the role of communicating with
+  distributed clients
+
+TODO:
+    * Separate the training procedure into another class and encapsulate it
+    * Separate _do_dummy_prediction and refactor it
+    * Separate _do_traditional_prediction and refactor it
+    * Refactor _search
+    * Reduce unimportant instance variables
+    * Distinguish private and public variables by the leading underscore `_`
+"""
+
 import copy
 import json
 import logging.handlers
diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py
index 652a546b9..de77f440d 100644
--- a/autoPyTorch/constants.py
+++ b/autoPyTorch/constants.py
@@ -1,3 +1,10 @@
+"""Constant variables used in AutoPyTorch
+
+TODO:
+    * Make everything enumerators
+    * Avoid the use of plain integers
+"""
+
 TABULAR_CLASSIFICATION = 1
 IMAGE_CLASSIFICATION = 2
 TABULAR_REGRESSION = 3
diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 2ef02ceba..6955dff8b 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,3 +1,15 @@
+"""Base class for the feature validator given a task
+* A wrapper class of sklearn.base.BaseEstimator
+* The feature validator for each task inherits this class
+* Check whether the provided features can be processed by AutoPyTorch
+
+TODO:
+    * SUPPORTED_FEAT_TYPES --> Enumerator
+    * Describe the shape of X
+    * typing.X --> X (import the names from typing directly)
+    * logging.Logger --> Logger
+"""
+
 import logging
 import typing
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
index 44e73d42a..e3018e839 100644
--- a/autoPyTorch/data/base_target_validator.py
+++ b/autoPyTorch/data/base_target_validator.py
@@ -1,3 +1,19 @@
+"""Base class for the target (or label) validator given a task
+* A wrapper class of sklearn.base.BaseEstimator
+* The target validator for each task inherits this class
+* Check whether the provided targets (or labels) are compatible across both
+  the training and test data
+
+TODO:
+    * SUPPORTED_FEAT_TYPES --> Enumerator
+    * Describe the shape of y
+    * typing.X --> X (import the names from typing directly)
+    * logging.Logger --> Logger
+    * Rename classes_ --> get_classes
+    * Check the return of classes_
+    * is_single_column_target --> is_target_scalar
+"""
+
 import logging
 import typing
@@ -31,12 +47,13 @@ class BaseTargetValidator(BaseEstimator):
     """
     A class to pre-process targets. It validates the data provided during fit (to make sure
     it matches AutoPyTorch expectation) as well as encoding the targets in case of classification
+
     Attributes:
         is_classification (bool):
             A bool that indicates if the validator should operate in classification mode.
             During classification, the targets are encoded.
         encoder (typing.Optional[BaseEstimator]):
-            Host a encoder object if the data requires transformation (for example,
+            Host an encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
         enc_columns (typing.List[str])
            List of columns that where encoded
@@ -175,7 +192,7 @@ def classes_(self) -> np.ndarray:
         Complies with scikit learn classes_ attribute,
         which consist of a ndarray of shape (n_classes,)
         where n_classes are the number of classes seen while fitting
-        a encoder to the targets.
+        an encoder to the targets.
         Returns:
             classes_: np.ndarray
                 The unique classes seen during encoding of a classifier
diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py
index 7528d56ab..ff782c526 100644
--- a/autoPyTorch/data/base_validator.py
+++ b/autoPyTorch/data/base_validator.py
@@ -1,3 +1,16 @@
+"""Base class for the input validator given a task
+* A wrapper class of sklearn.base.BaseEstimator
+* The input validator for each task inherits this class
+* Check whether the provided data are compatible with the AutoPyTorch implementation
+* Manage both the target_validator and the feature_validator in this class
+
+TODO:
+    * typing.X --> X (import the names from typing directly)
+    * logging.Logger --> Logger
+    * Inherit feature_validator and target_validator from a child class
+      via super().__init__()
+"""
+
 # -*- encoding: utf-8 -*-
 import logging.handlers
 import typing
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
index 15a6dedf9..8393140e0 100644
--- a/autoPyTorch/datasets/base_dataset.py
+++ b/autoPyTorch/datasets/base_dataset.py
@@ -1,3 +1,19 @@
+"""Base class of the provided dataset
+* Provide data validation splits based on the type of data
+* Provide an API to return the training and validation splits
+* Store the properties of the dataset which are required
+  by the AutoPyTorch implementation
+
+TODO:
+    * Address: https://github.com/automl/Auto-PyTorch/pull/108/
+    * Make BaseDatasetPropertiesType more informative
+    * Use private and public variables properly
+    * Consider a more memory-efficient way to store the splits
+      ==> the current approach consumes a lot of memory for huge datasets
+    * Check the usage of the validation and test splits because cross validation
+      only uses the training dataset
+"""
+
 import os
 import uuid
 from abc import ABCMeta
diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py
index ac96c934a..0df54866c 100644
--- a/autoPyTorch/datasets/resampling_strategy.py
+++ b/autoPyTorch/datasets/resampling_strategy.py
@@ -1,3 +1,19 @@
+"""Functions for the resampling strategies or cross validation
+* Each function is used in BaseDataset to provide the dataset splits
+
+TODO:
+    * DEFAULT_RESAMPLING_PARAMETERS --> keyword arguments
+    * Add documentation strings
+    * Make shuffle and stratified arguments rather than
+      independent methods
+    * Force the instantiation of each splitting method
+      ==> the instance variables then tell you what kind of splitting is used
+    * Delete the protocol and enumerator because we do not need them
+      once we make them classes that require instantiation
+    * resampling_strategy --> splitting_fn
+    * resampling_strategy_args --> splitting_params
+"""
+
 from enum import IntEnum
 from typing import Any, Dict, List, Optional, Tuple, Union
diff --git a/autoPyTorch/ensemble/abstract_ensemble.py b/autoPyTorch/ensemble/abstract_ensemble.py
index 072b6d260..6c22d5ced 100644
--- a/autoPyTorch/ensemble/abstract_ensemble.py
+++ b/autoPyTorch/ensemble/abstract_ensemble.py
@@ -1,3 +1,11 @@
+"""The abstract base class for the ensemble classes
+* Provide the methods that must be overridden by each child class
+
+TODO:
+    * Add `raise NotImplementedError`
+    * model_identifiers --> List[]
+"""
+
 from abc import ABCMeta, abstractmethod
 from typing import Any, Dict, List, Tuple, Union
diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py
index a22d413f7..163eb97df 100644
--- a/autoPyTorch/ensemble/ensemble_builder.py
+++ b/autoPyTorch/ensemble/ensemble_builder.py
@@ -1,3 +1,30 @@
+"""The module that enables building an ensemble
+* EnsembleBuilderManager serves as a central system that
system that + submit an EnsembleBuilder to dask +* EnsembleBuilder builds an ensemble using pynisher + so that we can easily suppress the memory usage and runtime +* EnsembleBuilder builds an ensemble using the configurations + that are observed in HPO + +TODO: + * Unused arguments in EnsembleBuilderManager.__call__ + * Remove the argument `unit_test` and separate methods + with patch.object(, '', side_effect=MemoryError): + inst = (arguments) + inst.() <== MemoryError + + * Remove unneeded comments + * Make precision in a better way (enum, np.int32 ...) + * Separate `raise Error` methods in EnsembleBuilder + + run + + main + + compute_loss_per_model + + get_n_best_preds + * Separate more general function from EnsembleBuilder + + get_disk_consumption + + _read_np_fn +""" + # -*- encoding: utf-8 -*- import glob import gzip diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py index b8f379e55..607533651 100644 --- a/autoPyTorch/ensemble/ensemble_selection.py +++ b/autoPyTorch/ensemble/ensemble_selection.py @@ -1,3 +1,26 @@ +"""The title of the module description # noqa +* Describe at the beginning of the source code. +* Describe before the package imports + +TODO: + * Add the following + References: + Title: Ensemble Selection from Libraries of Models + Authors: Rich Caruana et. al. + URL: https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf + + * `A copy of self` --> check if it is really true + * Change `_` to `_` + * get_models_with_weights --> looks sort by descending of weights + * soft voting ==> explanation + References: + Title: Consensus Based Ensembles of Soft Clusterings + Authors: Kunal Punera and Joydeep Ghosh + URL: https://www.researchgate.net/profile/Joydeep-Ghosh-8/publication/221188694_Consensus_Based_Ensembles_of_Soft_Clusterings/links/02e7e521fe367e06c3000000/Consensus-Based-Ensembles-of-Soft-Clusterings.pdf + * _calculate_weights ==> what about np.sum(weights) > 1?? 
+    * Refactor _fit() and add the shape of predictions
+"""
+
 from collections import Counter
 from typing import Any, Dict, List, Tuple, Union
diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py
index 881ae5fd2..78c8cd5d7 100644
--- a/autoPyTorch/ensemble/singlebest_ensemble.py
+++ b/autoPyTorch/ensemble/singlebest_ensemble.py
@@ -1,3 +1,15 @@
+"""Backup solution class for a crashed search
+* Provides the single best configuration instead of an ensemble
+  with multiple models
+
+TODO:
+    * Change `_` to `_`
+    * Add more `raise` statements since this class is supposed
+      to be used in very specific situations
+    * Check the contexts where this class is called because
+      self.weights_ and self.indices_ are not clear enough
+"""
+
 import os
 from typing import Any, Dict, List, Tuple, Union
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
index 0ba588276..893f20a14 100644
--- a/autoPyTorch/evaluation/abstract_evaluator.py
+++ b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -1,3 +1,39 @@
+"""This module provides the model estimator pipelines.
+It has the following pipelines:
+    - MyTraditionalTabularClassificationPipeline
+        Wrapper class for traditional ML classification methods
+        such as CatBoost and RandomForest
+    - MyTraditionalTabularRegressionPipeline
+        Wrapper class for traditional ML regression methods
+        such as RandomForest
+    - DummyClassificationPipeline
+        Wrapper class for the dummy classifier in sklearn
+    - DummyRegressionPipeline
+        Wrapper class for the dummy regressor in sklearn
+    - AbstractEvaluator
+        The interface for the pipeline evaluators
+        to optimize via SMAC
+
+Note: A dummy model is an estimator that uses a very simple rule
+      and serves as the minimum baseline for each task.
+      https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html  # noqa: W291
+      https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html  # noqa: W291
+
+TODO:
+    * Describe the definition of sample_weight
+    * import autoPyTorch.pipeline.xxx as shorter names
+    * Describe the shape of the returns in predict and predict_proba
+    * Improve the documentation of additional_run_info
+    * Change get_pipeline_representation --> __repr__
+    * Delete self.random_state, self.init_params, self.config,
+      self.dataset_properties
+      (because they are not used)
+    * [named_step](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)  # noqa: W291
+    * Fix the typing of config in DummyXXXPipeline
+    * Add an enumerator for additional_run_info
+    * Rename fit_and_suppress_warnings
+"""
+
 import logging.handlers
 import time
 import warnings
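For context on the Note in the abstract_evaluator docstring above: the dummy pipelines wrap scikit-learn's rule-based estimators. Below is a minimal illustrative sketch (not part of this patch; the toy data are made up) of how such a baseline behaves:

    import numpy as np
    from sklearn.dummy import DummyClassifier

    X = np.random.rand(20, 3)          # toy feature matrix: 20 samples, 3 features
    y = np.array([0] * 15 + [1] * 5)   # imbalanced binary targets

    # strategy="prior" ignores X and always predicts based on the class priors seen during fit
    baseline = DummyClassifier(strategy="prior")
    baseline.fit(X, y)

    print(baseline.predict(X[:3]))     # always the majority class: [0 0 0]
    print(baseline.score(X, y))        # accuracy of the trivial baseline: 0.75

Any pipeline found during the search is expected to beat this score; otherwise the run is no better than always guessing the majority class.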