diff --git a/atm/__init__.py b/atm/__init__.py
index 0689baa..132d885 100644
--- a/atm/__init__.py
+++ b/atm/__init__.py
@@ -1,5 +1,11 @@
-"""An AutoML framework.
+"""Auto Tune Models
+A multi-user, multi-data AutoML framework.
 """
 from __future__ import absolute_import
+import os
+
+# Get the path of the project root, so that the rest of the project can
+# reference files relative to there.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
 
 from . import config, constants, database, enter_data, method, metrics, model, utilities, worker
diff --git a/atm/config.py b/atm/config.py
index 941aa17..0103e5e 100644
--- a/atm/config.py
+++ b/atm/config.py
@@ -278,8 +278,13 @@ def add_arguments_datarun(parser):
     #   pa - passive aggressive
     #   knn - K nearest neighbors
     #   mlp - multi-layer perceptron
-    parser.add_argument('--methods', nargs='+', choices=METHODS,
-                        help='list of methods which the datarun will use')
+    parser.add_argument('--methods', nargs='+',
+                        type=option_or_path(METHODS, JSON_REGEX),
+                        help='Method or list of methods to use for '
+                        'classification. Each method can either be one of the '
+                        'pre-defined method codes listed below or a path to a '
+                        'JSON file defining a custom method.'
+                        + '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS))
     parser.add_argument('--priority', type=int,
                         help='Priority of the datarun (higher = more important)')
     parser.add_argument('--budget-type', choices=BUDGET_TYPES,
@@ -291,13 +296,21 @@ def add_arguments_datarun(parser):
                         'overrides the walltime budget.\nFormat: ' +
                         TIME_FMT.replace('%', '%%'))
 
-    # Which field to use for judgment of performance
+    # Which field to use to judge performance, for the sake of AutoML
     # options:
     #   f1        - F1 score (harmonic mean of precision and recall)
     #   roc_auc   - area under the Receiver Operating Characteristic curve
     #   accuracy  - percent correct
-    #   mu_sigma  - one standard deviation below the average cross-validated F1
-    #               score (mu - sigma)
+    #   cohen_kappa     - measures accuracy, but controls for chance of guessing
+    #                     correctly
+    #   rank_accuracy   - multiclass only: percent of examples for which the true
+    #                     label is in the top 1/3 most likely predicted labels
+    #   ap              - average precision: nearly identical to area under
+    #                     precision/recall curve.
+    #   mcc             - Matthews correlation coefficient: good for unbalanced classes
+    #
+    # f1 and roc_auc may be appended with _micro or _macro to use with
+    # multiclass problems.
     parser.add_argument('--metric', choices=METRICS,
                         help='Metric by which ATM should evaluate classifiers. '
                         'The metric function specified here will be used to '
@@ -328,7 +341,7 @@ def add_arguments_datarun(parser):
                         help='Type of BTB tuner to use. Can either be one of '
                         'the pre-configured tuners listed below or a path to a '
                         'custom tuner in the form "/path/to/tuner.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))
 
     # How should ATM select a particular hyperpartition from the set of all
     # possible hyperpartitions?
@@ -347,7 +360,7 @@ def add_arguments_datarun(parser):
                         help='Type of BTB selector to use. Can either be one of '
                         'the pre-configured selectors listed below or a path to a '
                         'custom selector in the form "/path/to/selector.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))
 
     # r_min is the number of random runs performed in each hyperpartition before
     # allowing bayesian opt to select parameters. Consult the thesis to
diff --git a/atm/constants.py b/atm/constants.py
index 8ffb11d..f57245e 100644
--- a/atm/constants.py
+++ b/atm/constants.py
@@ -1,3 +1,5 @@
+import os
+from atm import PROJECT_ROOT
 # sample tuners
 from btb.tuning import Uniform as UniformTuner, GP, GPEi, GPEiVelocity
 # hyperpartition selectors
@@ -21,9 +23,12 @@ PARTITION_STATUS = ['incomplete', 'errored', 'gridding_done']
 
 TIME_FMT = '%Y-%m-%d %H:%M'
 
-DATA_PATH = 'data/downloads'
+DATA_DL_PATH = os.path.join(PROJECT_ROOT, 'data/downloads')
+METHOD_PATH = os.path.join(PROJECT_ROOT, 'methods')
+LOG_PATH = os.path.join(PROJECT_ROOT, 'logs')
 
 CUSTOM_CLASS_REGEX = '(.*\.py):(\w+)$'
+JSON_REGEX = '(.*\.json)$'
 
 TUNERS_MAP = {
     'uniform': UniformTuner,
diff --git a/atm/database.py b/atm/database.py
index 79e1668..22d3d88 100644
--- a/atm/database.py
+++ b/atm/database.py
@@ -168,7 +168,7 @@ class Hyperpartition(Base):
     datarun = relationship('Datarun', back_populates='hyperpartitions')
 
     # these columns define the partition
-    method = Column(String(15))
+    method = Column(String(255))
     categoricals64 = Column(Text)
     tunables64 = Column(Text)
     constants64 = Column(Text)
diff --git a/atm/enter_data.py b/atm/enter_data.py
index 59f7c8a..038baeb 100755
--- a/atm/enter_data.py
+++ b/atm/enter_data.py
@@ -137,7 +137,7 @@ def enter_datarun(sql_config, run_config, aws_config=None,
 
     method_parts = {}
     for m in run_config.methods:
         # enumerate all combinations of categorical variables for this method
-        method = Method(METHODS_MAP[m])
+        method = Method(m)
         method_parts[m] = method.get_hyperpartitions()
         print('method', m, 'has', len(method_parts[m]), 'hyperpartitions')
diff --git a/atm/method.py b/atm/method.py
index 89199c7..4be7097 100644
--- a/atm/method.py
+++ b/atm/method.py
@@ -1,8 +1,88 @@
+from builtins import object, str as newstr
+
 import json
 from os.path import join
-from btb import HyperParameter
 
-CONFIG_PATH = 'methods'
+import btb
+from atm.constants import METHODS_MAP, METHOD_PATH
+
+
+class HyperParameter(object):
+    @property
+    def is_categorical(self):
+        return False
+
+    @property
+    def is_constant(self):
+        return False
+
+
+class Numeric(HyperParameter):
+    def __init__(self, name, type, range):
+        self.name = name
+        self.type = type
+        self.range = range
+
+    @property
+    def is_constant(self):
+        return len(self.range) == 1
+
+    def as_tunable(self):
+        return btb.HyperParameter(typ=self.type, rang=self.range)
+
+
+class Categorical(HyperParameter):
+    def __init__(self, name, type, values):
+        self.name = name
+        self.type = type
+        for i, val in enumerate(values):
+            if val is None:
+                # the value None is allowed for every parameter type
+                continue
+            if self.type == 'int_cat':
+                values[i] = int(val)
+            elif self.type == 'float_cat':
+                values[i] = float(val)
+            elif self.type == 'string':
+                # this is necessary to avoid a bug in sklearn, which won't be
+                # fixed until 0.20
+                values[i] = str(newstr(val))
+            elif self.type == 'bool':
+                values[i] = bool(val)
+        self.values = values
+
+    @property
+    def is_categorical(self):
+        return True
+
+    @property
+    def is_constant(self):
+        return len(self.values) == 1
+
+    def as_tunable(self):
+        return btb.HyperParameter(typ=self.type, rang=self.values)
+
+
+class List(HyperParameter):
+    def __init__(self, name, type, list_length, element):
+        self.name = name
+        self.size = Categorical('len(%s)' % self.name, 'int_cat', list_length)
+        element_type = HYPERPARAMETER_TYPES[element['type']]
+        self.element = element_type('element', **element)
+
+    @property
+    def is_categorical(self):
+        return True
+
+    def 
get_elements(self): + elements = [] + for i in range(max(self.size.values)): + # generate names for the pseudo-hyperparameters in the list + elt_name = '%s[%d]' % (self.name, i) + elements.append(elt_name) + + conditions = {str(i): elements[:i] for i in self.size.values} + return elements, conditions class HyperPartition(object): @@ -11,16 +91,30 @@ class HyperPartition(object): """ def __init__(self, categoricals, constants, tunables): """ - categoricals: the values for this hyperpartition which have been fixed, thus - defining the hyperpartition - constants: the values for this hyperpartition for which there was no choice - tunables: the free variables which must be tuned + categoricals: the hyperparameter values for this hyperpartition which + have been fixed, defining the hyperpartition + constants: the hyperparameters with only one choice + tunables: the numeric hyperparameters which must be tuned (of type + btb.HyperParameter) """ self.categoricals = categoricals self.constants = constants self.tunables = tunables +HYPERPARAMETER_TYPES = { + 'int': Numeric, + 'int_exp': Numeric, + 'float': Numeric, + 'float_exp': Numeric, + 'int_cat': Categorical, + 'float_cat': Categorical, + 'string': Categorical, + 'bool': Categorical, + 'list': List, +} + + class Method(object): """ This class is initialized with the name of a json configuration file. @@ -28,41 +122,84 @@ class Method(object): hyperparameter arguments it needs to run. Its main purpose is to generate hyperpartitions (possible combinations of categorical hyperparameters). """ - def __init__(self, config): + def __init__(self, method): """ - config: JSON dictionary containing all the information needed to specify - this enumerator + method: method code or path to JSON file containing all the information + needed to specify this enumerator. """ - with open(join(CONFIG_PATH, config)) as f: + if method in METHODS_MAP: + # if the configured method is a code, look up the path to its json + config_path = join(METHOD_PATH, METHODS_MAP[method]) + else: + # otherwise, it must be a path to a file + config_path = method + + with open(config_path) as f: config = json.load(f) self.name = config['name'] - self.conditions = config['conditions'] - self.root_params = config['root_parameters'] + self.root_params = config['root_hyperparameters'] + self.conditions = config['conditional_hyperparameters'] self.class_path = config['class'] # create hyperparameters from the parameter config - self.parameters = {k: HyperParameter(typ=v['type'], rang=v['range']) - for k, v in config['parameters'].items()} + self.parameters = {} + lists = [] + for k, v in config['hyperparameters'].items(): + param_type = HYPERPARAMETER_TYPES[v['type']] + self.parameters[k] = param_type(name=k, **v) + # List hyperparameters are special. These are replaced in the + # CPT with a size hyperparameter and sets of element hyperparameters + # conditioned on the size. 
+ for name, param in self.parameters.items(): + if type(param) == List: + elements, conditions = param.get_elements() + for e in elements: + self.parameters[e] = param.element - def get_hyperpartitions(self): + # add the size parameter, remove the list parameter + self.parameters[param.size.name] = param.size + del self.parameters[param.name] + + # if this is a root param, replace its name with the new size + # name in the root params list + if param.name in self.root_params: + self.root_params.append(param.size.name) + self.root_params.remove(param.name) + + # if this is a conditional param, replace it there instead + for cond, deps in self.conditions.items(): + if param.name in deps: + deps.append(param.size.name) + deps.remove(param.name) + self.conditions[cond] = deps + + # finally, add all the potential sets of list elements as + # conditions of the list's size + self.conditions[param.size.name] = conditions + + def _sort_parameters(self, params): """ - Traverse the CPT and enumerate all possible hyperpartitions of parameters - for this method + Sort a list of HyperParameter objects into lists of constants, + categoricals, and tunables. """ constants = [] categoricals = [] tunables = [] - for p in self.root_params: - if len(self.parameters[p].range) == 1: - constants.append((p, self.parameters[p].range[0])) - elif self.parameters[p].is_categorical: + for p in params: + param = self.parameters[p] + if param.is_constant: + if param.is_categorical: + constants.append((p, param.values[0])) + else: + constants.append((p, param.range[0])) + elif param.is_categorical: categoricals.append(p) else: - tunables.append((p, self.parameters[p])) + tunables.append((p, param.as_tunable())) - return self._enumerate([], constants, categoricals, tunables) + return constants, categoricals, tunables def _enumerate(self, fixed_cats, constants, free_cats, tunables): """ @@ -88,7 +225,7 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables): # variables, and see where that takes us cat = free_cats.pop(0) - for val in self.parameters[cat].range: + for val in self.parameters[cat].values: # add this value to the list of qualified categoricals new_fixed_cats = fixed_cats + [(cat, val)] @@ -103,13 +240,11 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables): # must be strings. if cat in self.conditions and str(val) in self.conditions[cat]: # categorize the conditional variables which are now in play - for p in self.conditions[cat][str(val)]: - if len(self.parameters[p].range) == 1: - new_constants.append((p, self.parameters[p].range[0])) - elif self.parameters[p].is_categorical: - new_free_cats.append(p) - else: - new_tunables.append((p, self.parameters[p])) + new_params = self.conditions[cat][str(val)] + cons, cats, tuns = self._sort_parameters(new_params) + new_constants = constants + cons + new_free_cats = free_cats + cats + new_tunables = tunables + tuns # recurse with the newly qualified categorical as a constant parts.extend(self._enumerate(fixed_cats=new_fixed_cats, @@ -118,3 +253,10 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables): tunables=new_tunables)) return parts + + def get_hyperpartitions(self): + """ + Traverse the CPT and enumerate all possible hyperpartitions of + categorical parameters for this method. 
+ """ + return self._enumerate([], *self._sort_parameters(self.root_params)) diff --git a/atm/model.py b/atm/model.py index 4473d18..2851357 100644 --- a/atm/model.py +++ b/atm/model.py @@ -8,7 +8,9 @@ import pandas as pd import time import pdb +import re from importlib import import_module +from collections import defaultdict from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler, MinMaxScaler @@ -44,25 +46,26 @@ class Model(object): # number of folds for cross-validation (arbitrary, for speed) N_FOLDS = 5 - def __init__(self, code, params, judgment_metric, label_column, + def __init__(self, method, params, judgment_metric, label_column, testing_ratio=0.3): """ Parameters: - code: the short method code (as defined in constants.py) + method: the short method code (as defined in constants.py) or path + to method json judgment_metric: string that indicates which metric should be optimized for. params: parameters passed to the sklearn classifier constructor class_: sklearn classifier class """ # configuration & database - self.code = code + self.method = method self.params = params self.judgment_metric = judgment_metric self.label_column = label_column self.testing_ratio = testing_ratio # load the classifier method's class - path = Method(METHODS_MAP[code]).class_path.split('.') + path = Method(method).class_path.split('.') mod_str, cls_str = '.'.join(path[:-1]), path[-1] mod = import_module(mod_str) self.class_ = getattr(mod, cls_str) @@ -122,7 +125,7 @@ def make_pipeline(self): steps.append(('minmax_scale', MinMaxScaler())) # add the classifier as the final step in the pipeline - steps.append((self.code, classifier)) + steps.append((self.method, classifier)) self.pipeline = Pipeline(steps) def cross_validate(self, X, y): @@ -214,8 +217,31 @@ def special_conversions(self, params): """ TODO: replace this logic with something better """ + # create list parameters + lists = defaultdict(list) + element_regex = re.compile('(.*)\[(\d)\]') + for name, param in params.items(): + # look for variables of the form "param_name[1]" + match = element_regex.match(name) + if match: + # name of the list parameter + lname = match.groups()[0] + # index of the list item + index = int(match.groups()[1]) + lists[lname].append((index, param)) + + # drop the element parameter from our list + del params[name] + + for lname, items in lists.items(): + # drop the list size parameter + del params['len(%s)' % lname] + + # sort the list by index + params[lname] = [val for idx, val in sorted(items)] + ## Gaussian process classifier - if self.code == "gp": + if self.method == "gp": if params["kernel"] == "constant": params["kernel"] = ConstantKernel() elif params["kernel"] == "rbf": @@ -230,39 +256,9 @@ def special_conversions(self, params): del params["alpha"] elif params["kernel"] == "exp_sine_squared": params["kernel"] = ExpSineSquared(length_scale=params["length_scale"], - periodicity=params["periodicity"]) + periodicity=params["periodicity"]) del params["length_scale"] del params["periodicity"] - ## Multi-layer perceptron - if self.code == "mlp": - - params["hidden_layer_sizes"] = [] - - # set layer topology - if int(params["num_hidden_layers"]) == 1: - params["hidden_layer_sizes"].append(params["hidden_size_layer1"]) - del params["hidden_size_layer1"] - - elif int(params["num_hidden_layers"]) == 2: - params["hidden_layer_sizes"].append(params["hidden_size_layer1"]) - params["hidden_layer_sizes"].append(params["hidden_size_layer2"]) - del params["hidden_size_layer1"] - del 
params["hidden_size_layer2"] - - elif int(params["num_hidden_layers"]) == 3: - params["hidden_layer_sizes"].append(params["hidden_size_layer1"]) - params["hidden_layer_sizes"].append(params["hidden_size_layer2"]) - params["hidden_layer_sizes"].append(params["hidden_size_layer3"]) - del params["hidden_size_layer1"] - del params["hidden_size_layer2"] - del params["hidden_size_layer3"] - - params["hidden_layer_sizes"] = [int(x) for x in - params["hidden_layer_sizes"]] # convert to ints - - # delete our fabricated keys - del params["num_hidden_layers"] - # return the updated parameter vector return params diff --git a/atm/utilities.py b/atm/utilities.py index 3c8a5ba..a8b474e 100644 --- a/atm/utilities.py +++ b/atm/utilities.py @@ -228,17 +228,17 @@ def get_local_data_path(data_path): if m: path = data_path[len(m.group()):].split('/') bucket = path.pop(0) - return os.path.join(DATA_PATH, path[-1]), FileType.S3 + return os.path.join(DATA_DL_PATH, path[-1]), FileType.S3 m = re.match(HTTP_PREFIX, data_path) if m: path = data_path[len(m.group()):].split('/') - return os.path.join(DATA_PATH, path[-1]), FileType.HTTP + return os.path.join(DATA_DL_PATH, path[-1]), FileType.HTTP return data_path, FileType.LOCAL -def download_file_s3(aws_path, aws_config, local_folder=DATA_PATH): +def download_file_s3(aws_path, aws_config, local_folder=DATA_DL_PATH): """ Download a file from an S3 bucket and save it in the local folder. """ # remove the prefix and extract the S3 bucket, folder, and file name m = re.match(S3_PREFIX, aws_path) @@ -274,7 +274,7 @@ def download_file_s3(aws_path, aws_config, local_folder=DATA_PATH): return path -def download_file_http(url, local_folder=DATA_PATH): +def download_file_http(url, local_folder=DATA_DL_PATH): """ Download a file from a public URL and save it locally. """ filename = url.split('/')[-1] if local_folder is not None: diff --git a/atm/worker.py b/atm/worker.py index 369c688..8a750b2 100755 --- a/atm/worker.py +++ b/atm/worker.py @@ -35,12 +35,14 @@ os.environ['GNUMPY_IMPLICIT_CONVERSION'] = 'allow' # get the file system in order +DEFAULT_MODEL_DIR = os.path.join(PROJECT_ROOT, 'models') +DEFAULT_METRIC_DIR = os.path.join(PROJECT_ROOT, 'metrics') + # make sure we have directories where we need them -LOG_DIR = 'logs' -ensure_directory(LOG_DIR) +ensure_directory(LOG_PATH) # name log file after the local hostname -LOG_FILE = os.path.join(LOG_DIR, '%s.txt' % socket.gethostname()) +LOG_FILE = os.path.join(LOG_PATH, '%s.txt' % socket.gethostname()) # how long to sleep between loops while waiting for new dataruns to be added LOOP_WAIT = 1 @@ -62,7 +64,8 @@ class ClassifierError(Exception): class Worker(object): def __init__(self, database, datarun, save_files=True, cloud_mode=False, - aws_config=None, model_dir='models', metric_dir='metrics'): + aws_config=None, model_dir=DEFAULT_MODEL_DIR, + metric_dir=DEFAULT_METRIC_DIR): """ database: Database object with connection information datarun: Datarun ORM object to work on. @@ -328,7 +331,7 @@ def test_classifier(self, method, params): classification model. 
Returns: Model object and metrics dictionary """ - model = Model(code=method, params=params, + model = Model(method=method, params=params, judgment_metric=self.datarun.metric, label_column=self.dataset.label_column) train_path, test_path = download_data(self.dataset.train_path, @@ -388,9 +391,9 @@ def run_classifier(self): hyperpartition.id) return - _log('Chose parameters for method %s:' % hyperpartition.method) - for k, v in params.items(): - _log('\t%s = %s' % (k, v)) + _log('Chose parameters for method "%s":' % hyperpartition.method) + for k in sorted(params.keys()): + _log('\t%s = %s' % (k, params[k])) _log('Creating classifier...') classifier = self.db.create_classifier(hyperpartition_id=hyperpartition.id, @@ -501,9 +504,11 @@ def work(db, datarun_ids=None, save_files=False, choose_randomly=True, parser.add_argument('--no-save', dest='save_files', default=True, action='store_const', const=False, help="don't save models and metrics for later") - parser.add_argument('--model-dir', dest='model_persist_dir', default='models', + parser.add_argument('--model-dir', dest='model_persist_dir', + default=DEFAULT_MODEL_DIR, help='Directory where computed models will be saved') - parser.add_argument('--metric-dir', dest='metric_persist_dir', default='metrics', + parser.add_argument('--metric-dir', dest='metric_persist_dir', + default=DEFAULT_METRIC_DIR, help='Directory where model metrics will be saved') # parse arguments and load configuration diff --git a/methods/adaboost.json b/methods/adaboost.json index 5c308bf..af117e0 100644 --- a/methods/adaboost.json +++ b/methods/adaboost.json @@ -1,16 +1,16 @@ { "name": "ada", "class": "sklearn.ensemble.AdaBoostClassifier", - "parameters": { + "hyperparameters": { "n_estimators": { - "type": "int", - "range": [25,500] + "type": "int", + "range": [25, 500] }, "learning_rate": { - "type": "float", - "range": [0.5,10] + "type": "float", + "range": [0.5, 10] } }, - "root_parameters": ["n_estimators", "learning_rate"], - "conditions": {} + "root_hyperparameters": ["n_estimators", "learning_rate"], + "conditional_hyperparameters": {} } diff --git a/methods/bernoulli_naive_bayes.json b/methods/bernoulli_naive_bayes.json index 7314673..e876c18 100644 --- a/methods/bernoulli_naive_bayes.json +++ b/methods/bernoulli_naive_bayes.json @@ -1,7 +1,7 @@ { "name": "bnb", "class": "sklearn.naive_bayes.BernoulliNB", - "parameters": { + "hyperparameters": { "alpha": { "type": "float", "range": [0.0, 1.0] @@ -11,18 +11,18 @@ "range": [0.0, 1.0] }, "fit_prior": { - "type": "int", - "range": [0, 1] + "type": "int", + "range": [0, 1] }, "class_prior": { - "type": "string", - "range": [null] + "type": "string", + "values": [null] }, "_scale": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] } }, - "root_parameters": ["alpha", "binarize", "fit_prior", "class_prior", "_scale"], - "conditions": {} + "root_hyperparameters": ["alpha", "binarize", "fit_prior", "class_prior", "_scale"], + "conditional_hyperparameters": {} } diff --git a/methods/decision_tree.json b/methods/decision_tree.json index 5e83255..ea09959 100644 --- a/methods/decision_tree.json +++ b/methods/decision_tree.json @@ -1,28 +1,28 @@ { "name": "dt", "class": "sklearn.tree.DecisionTreeClassifier", - "parameters": { + "hyperparameters": { "criterion": { "type": "string", - "range": ["entropy", "gini"] + "values": ["entropy", "gini"] }, "max_features": { "type": "float", "range": [0.1, 1.0] }, "max_depth": { - "type": "int", - "range": [2, 10] + "type": "int", + "range": [2, 10] }, 
"min_samples_split": { - "type": "int", - "range": [2, 4] + "type": "int", + "range": [2, 4] }, "min_samples_leaf": { - "type": "int", - "range": [1, 3] + "type": "int", + "range": [1, 3] } }, - "root_parameters": ["criterion", "max_features", "max_depth", "min_samples_split", "min_samples_leaf"], - "conditions": {} + "root_hyperparameters": ["criterion", "max_features", "max_depth", "min_samples_split", "min_samples_leaf"], + "conditional_hyperparameters": {} } diff --git a/methods/extra_trees.json b/methods/extra_trees.json index 11937eb..4602547 100644 --- a/methods/extra_trees.json +++ b/methods/extra_trees.json @@ -1,36 +1,36 @@ { "name": "et", "class": "sklearn.ensemble.ExtraTreesClassifier", - "parameters": { + "hyperparameters": { "criterion": { "type": "string", - "range": ["entropy", "gini"] + "values": ["entropy", "gini"] }, "max_features": { "type": "float", "range": [0.1, 1.0] }, "max_depth": { - "type": "int", - "range": [2, 10] + "type": "int", + "range": [2, 10] }, "min_samples_split": { - "type": "int", - "range": [2, 3] + "type": "int", + "range": [2, 3] }, "min_samples_leaf": { - "type": "int", - "range": [1, 2] + "type": "int", + "range": [1, 2] }, "n_estimators": { - "type": "int_cat", - "range": [100] + "type": "int_cat", + "values": [100] }, "n_jobs": { - "type": "int", - "range": [-1] + "type": "int", + "range": [-1] } }, - "root_parameters": ["criterion", "max_features", "max_depth", "min_samples_leaf", "min_samples_leaf", "n_estimators", "n_jobs"], - "conditions": {} + "root_hyperparameters": ["criterion", "max_features", "max_depth", "min_samples_leaf", "min_samples_leaf", "n_estimators", "n_jobs"], + "conditional_hyperparameters": {} } diff --git a/methods/gaussian_naive_bayes.json b/methods/gaussian_naive_bayes.json index aa52581..c073e16 100644 --- a/methods/gaussian_naive_bayes.json +++ b/methods/gaussian_naive_bayes.json @@ -1,12 +1,12 @@ { "name": "gnb", "class": "sklearn.naive_bayes.GaussianNB", - "parameters": { + "hyperparameters": { "_scale_minmax": { "type": "bool", - "range": [true] + "values": [true] } }, - "root_parameters": ["_scale_minmax"], - "conditions": {} + "root_hyperparameters": ["_scale_minmax"], + "conditional_hyperparameters": {} } diff --git a/methods/gaussian_process.json b/methods/gaussian_process.json index f8bad24..03404c0 100644 --- a/methods/gaussian_process.json +++ b/methods/gaussian_process.json @@ -1,30 +1,30 @@ { "name": "gp", "class": "sklearn.gaussian_process.GaussianProcessClassifier", - "parameters": { + "hyperparameters": { "kernel": { "type": "string", - "range": ["constant", "rbf", "matern", "rational_quadratic", "exp_sine_squared"] + "values": ["constant", "rbf", "matern", "rational_quadratic", "exp_sine_squared"] }, "nu": { "type": "float_cat", - "range": [0.5, 1.5, 2.5] + "values": [0.5, 1.5, 2.5] }, "length_scale": { - "type": "float_exp", - "range": [0.01, 100] + "type": "float_exp", + "range": [0.01, 100] }, "alpha": { - "type": "float", - "range": [0.0, 1.0] + "type": "float", + "range": [0.0, 1.0] }, "periodicity": { - "type": "int_cat", - "range": [0, 1] + "type": "int_cat", + "values": [0, 1] } }, - "root_parameters": ["kernel"], - "conditions": { + "root_hyperparameters": ["kernel"], + "conditional_hyperparameters": { "kernel": { "matern": ["nu"], "rational_quadratic": ["length_scale", "alpha"], diff --git a/methods/k_nearest_neighbors.json b/methods/k_nearest_neighbors.json index 3001e61..2067933 100644 --- a/methods/k_nearest_neighbors.json +++ b/methods/k_nearest_neighbors.json @@ -1,38 +1,38 @@ { "name": 
"knn", "class": "sklearn.neighbors.KNeighborsClassifier", - "parameters": { + "hyperparameters": { "n_neighbors": { - "type": "int", - "range": [1, 20] + "type": "int", + "range": [1, 20] }, "weights": { - "type": "string", - "range": ["uniform", "distance"] + "type": "string", + "values": ["uniform", "distance"] }, "algorithm": { - "type": "string", - "range": ["ball_tree", "kd_tree", "brute"] + "type": "string", + "values": ["ball_tree", "kd_tree", "brute"] }, "leaf_size": { - "type": "int", - "range": [1, 50] + "type": "int", + "range": [1, 50] }, "metric": { "type": "string", - "range": ["minkowski", "euclidean", "manhattan", "chebyshev"] + "values": ["minkowski", "euclidean", "manhattan", "chebyshev"] }, "p": { - "type": "int", - "range": [1, 3] + "type": "int", + "range": [1, 3] }, "_scale": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] } }, - "root_parameters": ["n_neighbors", "weights", "algorithm", "metric", "_scale"], - "conditions": { + "root_hyperparameters": ["n_neighbors", "weights", "algorithm", "metric", "_scale"], + "conditional_hyperparameters": { "metric": { "minkowski": ["p"] }, diff --git a/methods/logistic_regression.json b/methods/logistic_regression.json index 5560501..59ffd71 100644 --- a/methods/logistic_regression.json +++ b/methods/logistic_regression.json @@ -1,38 +1,38 @@ { "name": "logreg", "class": "sklearn.linear_model.LogisticRegression", - "parameters": { + "hyperparameters": { "C": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "tol": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "penalty": { - "type": "string", - "range": ["l1", "l2"] + "type": "string", + "values": ["l1", "l2"] }, "dual": { - "type": "bool", - "range": [true, false] + "type": "bool", + "values": [true, false] }, "fit_intercept": { "type": "bool", - "range": [true, false] + "values": [true, false] }, "class_weight": { - "type": "string", - "range": ["balanced"] + "type": "string", + "values": ["balanced"] }, "_scale": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] } }, - "root_parameters": ["C", "tol", "penalty", "fit_intercept", "class_weight", "_scale"], - "conditions": { + "root_hyperparameters": ["C", "tol", "penalty", "fit_intercept", "class_weight", "_scale"], + "conditional_hyperparameters": { "penalty": { "l2": ["dual"] } diff --git a/methods/multi_layer_perceptron.json b/methods/multi_layer_perceptron.json index f88231b..9e3a268 100644 --- a/methods/multi_layer_perceptron.json +++ b/methods/multi_layer_perceptron.json @@ -1,67 +1,54 @@ { "name": "mlp", "class": "sklearn.neural_network.MLPClassifier", - "parameters": { + "hyperparameters": { "batch_size": { - "type": "string", - "range": ["auto"] + "type": "string", + "values": ["auto"] }, "solver": { - "type": "string", - "range": ["lbfgs", "sgd", "adam"] + "type": "string", + "values": ["lbfgs", "sgd", "adam"] }, "alpha": { - "type": "float", - "range": [0.0001, 0.009] - }, - "num_hidden_layers": { - "type": "int_cat", - "range": [1, 2, 3] - }, - "hidden_size_layer1": { - "type": "int", - "range": [2, 300] - }, - "hidden_size_layer2": { - "type": "int", - "range": [2, 300] - }, - "hidden_size_layer3": { - "type": "int", - "range": [2, 300] + "type": "float", + "range": [0.0001, 0.009] }, "learning_rate_init": { - "type": "float", - "range": [0.001, 0.99] + "type": "float", + "range": [0.001, 0.99] }, "beta_1": { - "type": "float", - "range": [0.8, 0.9999] + "type": 
"float", + "range": [0.8, 0.9999] }, "beta_2": { - "type": "float", - "range": [0.8, 0.9999] + "type": "float", + "range": [0.8, 0.9999] }, "learning_rate": { - "type": "string", - "range": ["constant", "invscaling", "adaptive"] + "type": "string", + "values": ["constant", "invscaling", "adaptive"] }, "activation": { - "type": "string", - "range": ["relu", "logistic", "identity", "tanh"] + "type": "string", + "values": ["relu", "logistic", "identity", "tanh"] + }, + "hidden_layer_sizes": { + "type": "list", + "list_length": [1, 2, 3], + "element": { + "type": "int", + "range": [2, 300] + } }, "_scale": { - "type": "string", - "range": [true] + "type": "string", + "values": [true] } }, - "root_parameters": ["batch_size", "solver", "alpha", "activation", "num_hidden_layers", "_scale"], - "conditions": { - "num_hidden_layers": { - "1": ["hidden_size_layer1"], - "2": ["hidden_size_layer1", "hidden_size_layer2"], - "3": ["hidden_size_layer1", "hidden_size_layer2", "hidden_size_layer3"] - }, + "root_hyperparameters": ["batch_size", "solver", "alpha", "activation", "hidden_layer_sizes", "_scale"], + "conditional_hyperparameters": { "solver": { "sgd": ["learning_rate_init", "learning_rate"], "adam": ["learning_rate_init", "beta_1", "beta_2"] diff --git a/methods/multinomial_naive_bayes.json b/methods/multinomial_naive_bayes.json index 0fa5e7f..49eeb3e 100644 --- a/methods/multinomial_naive_bayes.json +++ b/methods/multinomial_naive_bayes.json @@ -1,24 +1,24 @@ { "name": "mnb", "class": "sklearn.naive_bayes.MultinomialNB", - "parameters": { + "hyperparameters": { "alpha": { "type": "float", "range": [0.0, 1.0] }, "fit_prior": { - "type": "int", - "range": [0, 1] + "type": "int", + "range": [0, 1] }, "class_prior": { - "type": "string", - "range": [null] + "type": "string", + "values": [null] }, "_scale_minmax": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] } }, - "root_parameters": ["alpha", "fit_prior", "class_prior", "_scale_minmax"], - "conditions": {} + "root_hyperparameters": ["alpha", "fit_prior", "class_prior", "_scale_minmax"], + "conditional_hyperparameters": {} } diff --git a/methods/passive_aggressive.json b/methods/passive_aggressive.json index bd3e354..0b42cf9 100644 --- a/methods/passive_aggressive.json +++ b/methods/passive_aggressive.json @@ -1,36 +1,36 @@ { "name": "pa", "class": "sklearn.linear_model.PassiveAggressiveClassifier", - "parameters": { + "hyperparameters": { "C": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "fit_intercept": { - "type": "int_cat", - "range": [0, 1] + "type": "int_cat", + "values": [0, 1] }, "n_iter": { - "type": "int", - "range": [10, 200] + "type": "int", + "range": [10, 200] }, "shuffle": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] }, "loss": { "type": "string", - "range": ["hinge", "squared_hinge"] + "values": ["hinge", "squared_hinge"] }, "_scale": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] }, "n_jobs": { - "type": "int_cat", - "range": [-1] + "type": "int_cat", + "values": [-1] } }, - "root_parameters": ["C", "fit_intercept", "n_iter", "shuffle", "loss", "_scale", "n_jobs"], - "conditions": {} + "root_hyperparameters": ["C", "fit_intercept", "n_iter", "shuffle", "loss", "_scale", "n_jobs"], + "conditional_hyperparameters": {} } diff --git a/methods/random_forest.json b/methods/random_forest.json index 7b600e2..d33d1c7 100644 --- a/methods/random_forest.json +++ b/methods/random_forest.json @@ -1,36 +1,36 @@ 
{ "name": "rf", "class": "sklearn.ensemble.RandomForestClassifier", - "parameters": { + "hyperparameters": { "criterion": { "type": "string", - "range": ["entropy", "gini"] + "values": ["entropy", "gini"] }, "max_features": { "type": "float", "range": [0.1, 1.0] }, "max_depth": { - "type": "int", - "range": [2, 10] + "type": "int", + "range": [2, 10] }, "min_samples_split": { - "type": "int", - "range": [2, 4] + "type": "int", + "range": [2, 4] }, "min_samples_leaf": { - "type": "int", - "range": [1, 3] + "type": "int", + "range": [1, 3] }, "n_estimators": { - "type": "int_cat", - "range": [100] + "type": "int_cat", + "values": [100] }, "n_jobs": { - "type": "int_cat", - "range": [-1] + "type": "int_cat", + "values": [-1] } }, - "root_parameters": ["criterion", "max_features", "max_depth", "min_samples_leaf", "min_samples_leaf", "n_estimators", "n_jobs"], - "conditions": {} + "root_hyperparameters": ["criterion", "max_features", "max_depth", "min_samples_leaf", "min_samples_leaf", "n_estimators", "n_jobs"], + "conditional_hyperparameters": {} } diff --git a/methods/stochastic_gradient_descent.json b/methods/stochastic_gradient_descent.json index c114968..9e63983 100644 --- a/methods/stochastic_gradient_descent.json +++ b/methods/stochastic_gradient_descent.json @@ -1,60 +1,60 @@ { "name": "sgd", "class": "sklearn.linear_model.SGDClassifier", - "parameters": { + "hyperparameters": { "loss": { "type": "string", - "range": ["hinge", "log", "modified_huber", "squared_hinge"] + "values": ["hinge", "log", "modified_huber", "squared_hinge"] }, "penalty": { "type": "string", - "range": ["l1", "l2", "elasticnet"] + "values": ["l1", "l2", "elasticnet"] }, "alpha": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "l1_ratio": { - "type": "float", - "range": [0.0, 1.0] + "type": "float", + "range": [0.0, 1.0] }, "fit_intercept": { - "type": "int_cat", - "range": [0, 1] + "type": "int", + "range": [0, 1] }, "n_iter": { - "type": "int", - "range": [10, 200] + "type": "int", + "range": [10, 200] }, "shuffle": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] }, "epsilon": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "learning_rate": { - "type": "string", - "range": ["constant", "optimal"] + "type": "string", + "values": ["constant", "optimal"] }, "eta0": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "class_weight": { - "type": "string", - "range": [null] + "type": "string", + "values": [null] }, "_scale_minmax": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] }, "n_jobs": { - "type": "int_cat", - "range": [-1] + "type": "int_cat", + "values": [-1] } }, - "root_parameters": ["loss", "penalty", "alpha", "l1_ratio", "fit_intercept", "n_iter", "shuffle", "epsilon", "learning_rate", "eta0", "class_weight", "_scale_minmax", "n_jobs"], - "conditions": {} + "root_hyperparameters": ["loss", "penalty", "alpha", "l1_ratio", "fit_intercept", "n_iter", "shuffle", "epsilon", "learning_rate", "eta0", "class_weight", "_scale_minmax", "n_jobs"], + "conditional_hyperparameters": {} } diff --git a/methods/support_vector_machine.json b/methods/support_vector_machine.json index 515a634..b0fd342 100644 --- a/methods/support_vector_machine.json +++ b/methods/support_vector_machine.json @@ -1,22 +1,22 @@ { "name": "svm", "class": "sklearn.svm.SVC", - "parameters": { + "hyperparameters": { "C": { - "type": "float_exp", - "range": 
[1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "gamma": { - "type": "float_exp", - "range": [1e-5, 1e5] + "type": "float_exp", + "range": [1e-5, 1e5] }, "kernel": { - "type": "string", - "range": ["rbf", "poly", "linear", "sigmoid"] + "type": "string", + "values": ["rbf", "poly", "linear", "sigmoid"] }, "degree": { - "type": "int", - "range": [2, 5] + "type": "int", + "range": [2, 5] }, "coef0": { "type": "int", @@ -24,31 +24,31 @@ }, "probability": { "type": "bool", - "range": [true] + "values": [true] }, "shrinking": { "type": "bool", - "range": [true] + "values": [true] }, "cache_size": { - "type": "int", - "range": [15000] + "type": "int", + "range": [15000] }, "class_weight": { - "type": "string", - "range": ["balanced"] + "type": "string", + "values": ["balanced"] }, "_scale": { - "type": "bool", - "range": [true] + "type": "bool", + "values": [true] }, "max_iter": { - "type": "int", - "range": [50000] + "type": "int", + "range": [50000] } }, - "root_parameters": ["C", "kernel", "probability", "shrinking", "cache_size", "class_weight", "max_iter", "_scale"], - "conditions": { + "root_hyperparameters": ["C", "kernel", "probability", "shrinking", "cache_size", "class_weight", "max_iter", "_scale"], + "conditional_hyperparameters": { "kernel": { "rbf": ["gamma"], "sigmoid": ["gamma", "coef0"], diff --git a/test/btb_test.py b/test/btb_test.py index 6c759b3..7f6486f 100644 --- a/test/btb_test.py +++ b/test/btb_test.py @@ -4,6 +4,7 @@ import random from os.path import join +from atm import PROJECT_ROOT from atm.config import * from atm.database import Database from atm.enter_data import enter_datarun @@ -11,7 +12,7 @@ from utilities import * -CONF_DIR = 'config/test/btb/' +CONF_DIR = os.path.join(PROJECT_ROOT, 'config/test/btb/') RUN_CONFIG = join(CONF_DIR, 'run.yaml') SQL_CONFIG = join(CONF_DIR, 'sql.yaml') diff --git a/test/end_to_end_test.py b/test/end_to_end_test.py index b40e6a6..4e1d53a 100644 --- a/test/end_to_end_test.py +++ b/test/end_to_end_test.py @@ -15,8 +15,8 @@ from utilities import * -CONF_DIR = 'config/test/end_to_end/' -DATA_DIR = 'data/test/' +CONF_DIR = os.path.join(PROJECT_ROOT, 'config/test/end_to_end/') +DATA_DIR = os.path.join(PROJECT_ROOT, 'data/test/') RUN_CONFIG = join(CONF_DIR, 'run.yaml') SQL_CONFIG = join(CONF_DIR, 'sql.yaml') diff --git a/test/method_test.py b/test/method_test.py index 35c065d..f6805d0 100644 --- a/test/method_test.py +++ b/test/method_test.py @@ -15,8 +15,8 @@ from utilities import * -CONF_DIR = 'config/test/method/' -DATA_DIR = 'data/test/' +CONF_DIR = os.path.join(PROJECT_ROOT, 'config/test/method/') +DATA_DIR = os.path.join(PROJECT_ROOT, 'data/test/') RUN_CONFIG = join(CONF_DIR, 'run.yaml') SQL_CONFIG = join(CONF_DIR, 'sql.yaml') DATASETS = [ diff --git a/test/utilities.py b/test/utilities.py index ad767e3..dc0d55e 100644 --- a/test/utilities.py +++ b/test/utilities.py @@ -17,7 +17,7 @@ except ImportError: plt = None -BASELINE_PATH = 'test/baselines/best_so_far/' +BASELINE_PATH = os.path.join(PROJECT_ROOT, 'test/baselines/best_so_far/') DATA_URL = 'https://s3.amazonaws.com/mit-dai-delphi-datastore/downloaded/' BASELINE_URL = 'https://s3.amazonaws.com/mit-dai-delphi-datastore/best_so_far/'
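
The sketches below are not part of the diff; they illustrate how the new interfaces fit together.

With the schema changes above (hyperparameters / root_hyperparameters / conditional_hyperparameters, and "values" for categorical types), a custom method file can now be passed straight to --methods or to Method(). A minimal sketch, assuming this branch of ATM (and btb) is importable; the file name, classifier, and ranges here are hypothetical:

import json

# hypothetical custom method using the new JSON schema
custom_method = {
    "name": "my_svc",
    "class": "sklearn.svm.SVC",
    "hyperparameters": {
        "C": {"type": "float_exp", "range": [1e-5, 1e5]},
        "kernel": {"type": "string", "values": ["linear", "rbf"]},
        "gamma": {"type": "float_exp", "range": [1e-5, 1e5]}
    },
    "root_hyperparameters": ["C", "kernel"],
    "conditional_hyperparameters": {
        "kernel": {"rbf": ["gamma"]}
    }
}

with open('my_svc.json', 'w') as f:
    json.dump(custom_method, f)

# Method() now accepts either a pre-defined code or a path to a JSON file
from atm.method import Method

parts = Method('my_svc.json').get_hyperpartitions()
print(len(parts))  # 2: one partition for kernel=linear, one for kernel=rbf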
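
The diff references option_or_path(METHODS, JSON_REGEX) in config.py without showing its definition. A plausible sketch of such an argparse `type` factory follows; this is an assumption about its behavior, not the project's actual implementation:

import argparse
import re

def option_or_path(options, regex):
    """Hypothetical sketch: build an argparse `type` callable that accepts
    either one of the known option codes or a path matching `regex`
    (e.g. JSON_REGEX = '(.*\.json)$')."""
    def type_check(arg):
        if arg in options:
            return arg
        if re.match(regex, arg):
            return arg
        raise argparse.ArgumentTypeError(
            '%r is neither a known option nor a matching path' % arg)
    return type_check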
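
Hyperpartition enumeration now walks the CPT via _sort_parameters and _enumerate. Using the logistic_regression.json shipped above (and assuming METHODS_MAP maps 'logreg' to that file), 'penalty' and 'fit_intercept' are root categoricals while 'dual' only enters play when penalty == 'l2', so the expected result is six partitions:

from atm.method import Method

parts = Method('logreg').get_hyperpartitions()
print(len(parts))  # expected: 6
for p in parts:
    print(p.categoricals)
# expected, in enumeration order:
# [('penalty', 'l1'), ('fit_intercept', True)]
# [('penalty', 'l1'), ('fit_intercept', False)]
# [('penalty', 'l2'), ('fit_intercept', True), ('dual', True)]
# [('penalty', 'l2'), ('fit_intercept', True), ('dual', False)]
# [('penalty', 'l2'), ('fit_intercept', False), ('dual', True)]
# [('penalty', 'l2'), ('fit_intercept', False), ('dual', False)]
# class_weight and _scale have a single value each, so they land in
# p.constants; C and tol are tunables.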
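
The new "list" hyperparameter type replaces the old hand-rolled num_hidden_layers / hidden_size_layerN machinery in the MLP config. List.get_elements() expands a list into pseudo-hyperparameters plus a size parameter that conditions them, which can be seen directly with the MLP's hidden_layer_sizes definition from the diff:

from atm.method import List

param = List('hidden_layer_sizes', 'list', [1, 2, 3],
             {'type': 'int', 'range': [2, 300]})
elements, conditions = param.get_elements()

print(param.size.name)  # 'len(hidden_layer_sizes)'
print(elements)
# ['hidden_layer_sizes[0]', 'hidden_layer_sizes[1]', 'hidden_layer_sizes[2]']
print(conditions)
# {'1': ['hidden_layer_sizes[0]'],
#  '2': ['hidden_layer_sizes[0]', 'hidden_layer_sizes[1]'],
#  '3': ['hidden_layer_sizes[0]', 'hidden_layer_sizes[1]', 'hidden_layer_sizes[2]']}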
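
On the model side, Model.special_conversions folds those flattened element parameters back into a real list before the sklearn constructor sees them. A standalone sketch of that pass, for illustration only; note the list() around items(), since the loop in the diff deletes keys from params while iterating params.items(), which Python 3 rejects with a RuntimeError (it is fine under Python 2, where items() returns a list):

import re
from collections import defaultdict

def rebuild_lists(params):
    """Sketch of the list-rebuilding pass in Model.special_conversions:
    collect keys like 'name[i]' into a single list parameter 'name' and
    drop the fabricated 'len(name)' size key."""
    lists = defaultdict(list)
    element_regex = re.compile(r'(.*)\[(\d)\]')
    for name, value in list(params.items()):
        match = element_regex.match(name)
        if match:
            lname, index = match.groups()
            lists[lname].append((int(index), value))
            del params[name]
    for lname, items in lists.items():
        del params['len(%s)' % lname]
        # order elements by their index
        params[lname] = [val for idx, val in sorted(items)]
    return params

print(rebuild_lists({'len(hidden_layer_sizes)': 2,
                     'hidden_layer_sizes[0]': 100,
                     'hidden_layer_sizes[1]': 50,
                     'alpha': 0.0005}))
# {'alpha': 0.0005, 'hidden_layer_sizes': [100, 50]}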