Merge pull request #60 from HDI-Project/bcyphers/method_json_lists
Allow list hyperparameters in method JSON and add support for custom methods
Bennett Cyphers authored Jan 17, 2018
2 parents 9182af0 + a50a2e2 commit 4f9b483
Showing 27 changed files with 470 additions and 315 deletions.
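To ground the changes below: the new atm/method.py reads method configs with the keys name, class, hyperparameters, root_hyperparameters, and conditional_hyperparameters. A minimal sketch of a custom method JSON that exercises the new 'list' hyperparameter type; the method name, classifier path, and ranges are hypothetical, not part of this commit:

import json

# Hypothetical custom method definition: only the key names come from the
# code in this commit; all values are illustrative.
custom_method = {
    'name': 'my_mlp',
    'class': 'sklearn.neural_network.MLPClassifier',
    'hyperparameters': {
        'alpha': {'type': 'float_exp', 'range': [1e-5, 1e-1]},
        'hidden_layer_sizes': {
            'type': 'list',                                 # the new list type
            'list_length': [1, 2, 3],                       # allowed list sizes
            'element': {'type': 'int', 'range': [8, 256]},  # spec for each element
        },
    },
    'root_hyperparameters': ['alpha', 'hidden_layer_sizes'],
    'conditional_hyperparameters': {},
}

with open('my_mlp.json', 'w') as f:
    json.dump(custom_method, f, indent=2)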
8 changes: 7 additions & 1 deletion atm/__init__.py
@@ -1,5 +1,11 @@
"""An AutoML framework.
"""Auto Tune Models
A multi-user, multi-data AutoML framework.
"""
+from __future__ import absolute_import
+import os
+
+# Get the path of the project root, so that the rest of the project can
+# reference files relative to there.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
from . import config, constants, database, enter_data, method, metrics, model, utilities, worker
27 changes: 20 additions & 7 deletions atm/config.py
@@ -278,8 +278,13 @@ def add_arguments_datarun(parser):
# pa - passive aggressive
# knn - K nearest neighbors
# mlp - multi-layer perceptron
-    parser.add_argument('--methods', nargs='+', choices=METHODS,
-                        help='list of methods which the datarun will use')
+    parser.add_argument('--methods', nargs='+',
+                        type=option_or_path(METHODS, JSON_REGEX),
+                        help='Method or list of methods to use for '
+                             'classification. Each method can either be one of the '
+                             'pre-defined method codes listed below or a path to a '
+                             'JSON file defining a custom method.' +
+                             '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS))
parser.add_argument('--priority', type=int,
                        help='Priority of the datarun (higher = more important)')
parser.add_argument('--budget-type', choices=BUDGET_TYPES,
@@ -291,13 +296,21 @@ def add_arguments_datarun(parser):
'overrides the walltime budget.\nFormat: ' +
TIME_FMT.replace('%', '%%'))

-    # Which field to use for judgment of performance
+    # Which field to use to judge performance, for the sake of AutoML
# options:
# f1 - F1 score (harmonic mean of precision and recall)
# roc_auc - area under the Receiver Operating Characteristic curve
# accuracy - percent correct
# mu_sigma - one standard deviation below the average cross-validated F1
# score (mu - sigma)
# cohen_kappa - measures accuracy, but controls for chance of guessing
# correctly
# rank_accuracy - multiclass only: percent of examples for which the true
# label is in the top 1/3 most likely predicted labels
# ap - average precision: nearly identical to area under
# precision/recall curve.
# mcc - matthews correlation coefficient: good for unbalanced classes
#
# f1 and roc_auc may be appended with _micro or _macro to use with
# multiclass problems.
parser.add_argument('--metric', choices=METRICS,
help='Metric by which ATM should evaluate classifiers. '
'The metric function specified here will be used to '
@@ -328,7 +341,7 @@ def add_arguments_datarun(parser):
help='Type of BTB tuner to use. Can either be one of '
'the pre-configured tuners listed below or a path to a '
'custom tuner in the form "/path/to/tuner.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))

# How should ATM select a particular hyperpartition from the set of all
# possible hyperpartitions?
@@ -347,7 +360,7 @@ def add_arguments_datarun(parser):
help='Type of BTB selector to use. Can either be one of '
'the pre-configured selectors listed below or a path to a '
                        'custom selector in the form "/path/to/selector.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))

# r_min is the number of random runs performed in each hyperpartition before
# allowing bayesian opt to select parameters. Consult the thesis to
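The --methods argument above relies on option_or_path, which is defined elsewhere in config.py and not shown in this diff. A minimal sketch of what such an argparse type callable could look like, assuming it accepts either a known option code or any string matching the given regex:

import argparse
import re

def option_or_path(options, regex):
    # Sketch only: pass a pre-defined option code through as-is, accept any
    # string matching `regex` (e.g. JSON_REGEX), and reject everything else.
    def type_check(arg):
        if arg in options or re.match(regex, arg):
            return arg
        raise argparse.ArgumentTypeError('%r is neither a known option nor a '
                                         'matching path' % arg)
    return type_check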
7 changes: 6 additions & 1 deletion atm/constants.py
@@ -1,3 +1,5 @@
+import os
+from atm import PROJECT_ROOT
# sample tuners
from btb.tuning import Uniform as UniformTuner, GP, GPEi, GPEiVelocity
# hyperpartition selectors
@@ -21,9 +23,12 @@
PARTITION_STATUS = ['incomplete', 'errored', 'gridding_done']

TIME_FMT = '%Y-%m-%d %H:%M'
-DATA_PATH = 'data/downloads'
+DATA_DL_PATH = os.path.join(PROJECT_ROOT, 'data/downloads')
+METHOD_PATH = os.path.join(PROJECT_ROOT, 'methods')
+LOG_PATH = os.path.join(PROJECT_ROOT, 'logs')

+CUSTOM_CLASS_REGEX = '(.*\.py):(\w+)$'
+JSON_REGEX = '(.*\.json)$'

TUNERS_MAP = {
'uniform': UniformTuner,
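For illustration, how the two new regexes distinguish the argument styles (the paths below are hypothetical):

import re
from atm.constants import CUSTOM_CLASS_REGEX, JSON_REGEX

re.match(JSON_REGEX, 'methods/my_method.json')             # matches: a custom method file
re.match(CUSTOM_CLASS_REGEX, '/path/to/tuner.py:MyTuner')  # groups: ('/path/to/tuner.py', 'MyTuner')
re.match(JSON_REGEX, 'knn')                                # no match: treated as a method code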
2 changes: 1 addition & 1 deletion atm/database.py
@@ -168,7 +168,7 @@ class Hyperpartition(Base):
datarun = relationship('Datarun', back_populates='hyperpartitions')

# these columns define the partition
-    method = Column(String(15))
+    method = Column(String(255))
categoricals64 = Column(Text)
tunables64 = Column(Text)
constants64 = Column(Text)
2 changes: 1 addition & 1 deletion atm/enter_data.py
@@ -137,7 +137,7 @@ def enter_datarun(sql_config, run_config, aws_config=None,
method_parts = {}
for m in run_config.methods:
# enumerate all combinations of categorical variables for this method
-        method = Method(METHODS_MAP[m])
+        method = Method(m)
method_parts[m] = method.get_hyperpartitions()
print('method', m, 'has', len(method_parts[m]), 'hyperpartitions')

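With Method(m) taking the configured string directly, one code path now covers both built-in codes and custom files. A hypothetical usage sketch (the custom path is illustrative):

from atm.method import Method

# built-in code: resolved through METHODS_MAP to a bundled JSON file
knn_parts = Method('knn').get_hyperpartitions()

# custom method: any path to a JSON file with the same structure
custom_parts = Method('methods/my_mlp.json').get_hyperpartitions()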
204 changes: 173 additions & 31 deletions atm/method.py
@@ -1,8 +1,88 @@
+from builtins import object, str as newstr
+
import json
from os.path import join
-from btb import HyperParameter

-CONFIG_PATH = 'methods'
+import btb
+from atm.constants import METHODS_MAP, METHOD_PATH


class HyperParameter(object):
@property
def is_categorical(self):
return False

@property
def is_constant(self):
return False


class Numeric(HyperParameter):
def __init__(self, name, type, range):
self.name = name
self.type = type
self.range = range

@property
def is_constant(self):
return len(self.range) == 1

def as_tunable(self):
return btb.HyperParameter(typ=self.type, rang=self.range)


class Categorical(HyperParameter):
def __init__(self, name, type, values):
self.name = name
self.type = type
for i, val in enumerate(values):
if val is None:
# the value None is allowed for every parameter type
continue
if self.type == 'int_cat':
values[i] = int(val)
elif self.type == 'float_cat':
values[i] = float(val)
elif self.type == 'string':
# this is necessary to avoid a bug in sklearn, which won't be
# fixed until 0.20
values[i] = str(newstr(val))
elif self.type == 'bool':
values[i] = bool(val)
self.values = values

@property
def is_categorical(self):
return True

@property
def is_constant(self):
return len(self.values) == 1

def as_tunable(self):
return btb.HyperParameter(typ=self.type, rang=self.values)


class List(HyperParameter):
def __init__(self, name, type, list_length, element):
self.name = name
self.size = Categorical('len(%s)' % self.name, 'int_cat', list_length)
element_type = HYPERPARAMETER_TYPES[element['type']]
self.element = element_type('element', **element)

@property
def is_categorical(self):
return True

def get_elements(self):
elements = []
for i in range(max(self.size.values)):
# generate names for the pseudo-hyperparameters in the list
elt_name = '%s[%d]' % (self.name, i)
elements.append(elt_name)

conditions = {str(i): elements[:i] for i in self.size.values}
return elements, conditions
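# Illustrative sketch, not part of the diff: for a hypothetical list
# hyperparameter named 'hidden_layer_sizes' with allowed lengths [1, 2, 3],
# get_elements() produces one pseudo-hyperparameter per possible slot plus a
# conditions map keyed by list size:
#
#     elements   == ['hidden_layer_sizes[0]', 'hidden_layer_sizes[1]',
#                    'hidden_layer_sizes[2]']
#     conditions == {'1': elements[:1], '2': elements[:2], '3': elements[:3]}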


class HyperPartition(object):
@@ -11,58 +91,115 @@ class HyperPartition(object):
"""
def __init__(self, categoricals, constants, tunables):
"""
-        categoricals: the values for this hyperpartition which have been fixed, thus
-            defining the hyperpartition
-        constants: the values for this hyperpartition for which there was no choice
-        tunables: the free variables which must be tuned
+        categoricals: the hyperparameter values for this hyperpartition which
+            have been fixed, defining the hyperpartition
+        constants: the hyperparameters with only one choice
+        tunables: the numeric hyperparameters which must be tuned (of type
+            btb.HyperParameter)
"""
self.categoricals = categoricals
self.constants = constants
self.tunables = tunables


HYPERPARAMETER_TYPES = {
'int': Numeric,
'int_exp': Numeric,
'float': Numeric,
'float_exp': Numeric,
'int_cat': Categorical,
'float_cat': Categorical,
'string': Categorical,
'bool': Categorical,
'list': List,
}
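# Illustrative sketch, not part of the diff: how a JSON spec is dispatched
# through HYPERPARAMETER_TYPES (the name and range here are hypothetical).
#
#     spec = {'type': 'float_exp', 'range': [1e-5, 1e5]}
#     param = HYPERPARAMETER_TYPES[spec['type']](name='C', **spec)
#
# param is a Numeric; param.is_constant is False, and param.as_tunable()
# wraps it as a btb.HyperParameter for the tuner.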


class Method(object):
"""
This class is initialized with the name of a json configuration file.
The config contains information about a classification method and the
hyperparameter arguments it needs to run. Its main purpose is to generate
hyperpartitions (possible combinations of categorical hyperparameters).
"""
-    def __init__(self, config):
+    def __init__(self, method):
"""
-        config: JSON dictionary containing all the information needed to specify
-            this enumerator
+        method: method code or path to JSON file containing all the information
+            needed to specify this enumerator.
        """
-        with open(join(CONFIG_PATH, config)) as f:
+        if method in METHODS_MAP:
+            # if the configured method is a code, look up the path to its json
+            config_path = join(METHOD_PATH, METHODS_MAP[method])
+        else:
+            # otherwise, it must be a path to a file
+            config_path = method
+
+        with open(config_path) as f:
config = json.load(f)

self.name = config['name']
-        self.conditions = config['conditions']
-        self.root_params = config['root_parameters']
+        self.root_params = config['root_hyperparameters']
+        self.conditions = config['conditional_hyperparameters']
self.class_path = config['class']

# create hyperparameters from the parameter config
-        self.parameters = {k: HyperParameter(typ=v['type'], rang=v['range'])
-                           for k, v in config['parameters'].items()}
+        self.parameters = {}
+        lists = []
+        for k, v in config['hyperparameters'].items():
+            param_type = HYPERPARAMETER_TYPES[v['type']]
+            self.parameters[k] = param_type(name=k, **v)

# List hyperparameters are special. These are replaced in the
# CPT with a size hyperparameter and sets of element hyperparameters
# conditioned on the size.
for name, param in self.parameters.items():
if type(param) == List:
elements, conditions = param.get_elements()
for e in elements:
self.parameters[e] = param.element

-    def get_hyperpartitions(self):
# add the size parameter, remove the list parameter
self.parameters[param.size.name] = param.size
del self.parameters[param.name]

# if this is a root param, replace its name with the new size
# name in the root params list
if param.name in self.root_params:
self.root_params.append(param.size.name)
self.root_params.remove(param.name)

# if this is a conditional param, replace it there instead
for cond, deps in self.conditions.items():
if param.name in deps:
deps.append(param.size.name)
deps.remove(param.name)
self.conditions[cond] = deps

# finally, add all the potential sets of list elements as
# conditions of the list's size
self.conditions[param.size.name] = conditions

def _sort_parameters(self, params):
"""
-        Traverse the CPT and enumerate all possible hyperpartitions of parameters
-        for this method
+        Sort a list of HyperParameter objects into lists of constants,
+        categoricals, and tunables.
"""
constants = []
categoricals = []
tunables = []
-        for p in self.root_params:
-            if len(self.parameters[p].range) == 1:
-                constants.append((p, self.parameters[p].range[0]))
-            elif self.parameters[p].is_categorical:
+        for p in params:
+            param = self.parameters[p]
+            if param.is_constant:
+                if param.is_categorical:
+                    constants.append((p, param.values[0]))
+                else:
+                    constants.append((p, param.range[0]))
+            elif param.is_categorical:
categoricals.append(p)
else:
-                tunables.append((p, self.parameters[p]))
+                tunables.append((p, param.as_tunable()))

-        return self._enumerate([], constants, categoricals, tunables)
+        return constants, categoricals, tunables

def _enumerate(self, fixed_cats, constants, free_cats, tunables):
"""
@@ -88,7 +225,7 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
# variables, and see where that takes us
cat = free_cats.pop(0)

-        for val in self.parameters[cat].range:
+        for val in self.parameters[cat].values:
# add this value to the list of qualified categoricals
new_fixed_cats = fixed_cats + [(cat, val)]

@@ -103,13 +240,11 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
# must be strings.
if cat in self.conditions and str(val) in self.conditions[cat]:
# categorize the conditional variables which are now in play
-                for p in self.conditions[cat][str(val)]:
-                    if len(self.parameters[p].range) == 1:
-                        new_constants.append((p, self.parameters[p].range[0]))
-                    elif self.parameters[p].is_categorical:
-                        new_free_cats.append(p)
-                    else:
-                        new_tunables.append((p, self.parameters[p]))
+                new_params = self.conditions[cat][str(val)]
+                cons, cats, tuns = self._sort_parameters(new_params)
+                new_constants = constants + cons
+                new_free_cats = free_cats + cats
+                new_tunables = tunables + tuns

# recurse with the newly qualified categorical as a constant
parts.extend(self._enumerate(fixed_cats=new_fixed_cats,
@@ -118,3 +253,10 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
tunables=new_tunables))

return parts

def get_hyperpartitions(self):
"""
Traverse the CPT and enumerate all possible hyperpartitions of
categorical parameters for this method.
"""
return self._enumerate([], *self._sort_parameters(self.root_params))
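To make the recursion concrete, a small hypothetical walk-through of get_hyperpartitions; the method values are illustrative, not from this commit:

# hyperparameters:
#     kernel: {'type': 'string', 'values': ['linear', 'rbf']}
#     C:      {'type': 'float_exp', 'range': [1e-5, 1e5]}
# root_hyperparameters: ['kernel', 'C']
# conditional_hyperparameters: {}
#
# _sort_parameters puts 'kernel' in categoricals and 'C' in tunables;
# _enumerate then fixes 'kernel' to each of its values in turn and, with no
# conditions to expand and no free categoricals left, yields two partitions:
#
#     HyperPartition(categoricals=[('kernel', 'linear')], constants=[],
#                    tunables=[('C', <btb.HyperParameter>)])
#     HyperPartition(categoricals=[('kernel', 'rbf')], constants=[],
#                    tunables=[('C', <btb.HyperParameter>)])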
