Merge pull request #60 from HDI-Project/bcyphers/method_json_lists
Allow list hyperparameters in method JSON and add support for custom methods
Bennett Cyphers authored Jan 17, 2018
2 parents 9182af0 + a50a2e2 commit 4f9b483
Showing 27 changed files with 470 additions and 315 deletions.
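To ground the changes below: the new atm/method.py reads method configs with the keys name, class, hyperparameters, root_hyperparameters, and conditional_hyperparameters. A minimal sketch of a custom method JSON that exercises the new 'list' hyperparameter type; the method name, classifier path, and ranges are hypothetical, not part of this commit:

import json

# Hypothetical custom method definition: only the key names come from the
# code in this commit; all values are illustrative.
custom_method = {
    'name': 'my_mlp',
    'class': 'sklearn.neural_network.MLPClassifier',
    'hyperparameters': {
        'alpha': {'type': 'float_exp', 'range': [1e-5, 1e-1]},
        'hidden_layer_sizes': {
            'type': 'list',                                 # the new list type
            'list_length': [1, 2, 3],                       # allowed list sizes
            'element': {'type': 'int', 'range': [8, 256]},  # spec for each element
        },
    },
    'root_hyperparameters': ['alpha', 'hidden_layer_sizes'],
    'conditional_hyperparameters': {},
}

with open('my_mlp.json', 'w') as f:
    json.dump(custom_method, f, indent=2)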
8 changes: 7 additions & 1 deletion atm/__init__.py
@@ -1,5 +1,11 @@
"""An AutoML framework.
"""Auto Tune Models
A multi-user, multi-data AutoML framework.
"""
+from __future__ import absolute_import
+import os
+
+# Get the path of the project root, so that the rest of the project can
+# reference files relative to there.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
from . import config, constants, database, enter_data, method, metrics, model, utilities, worker
27 changes: 20 additions & 7 deletions atm/config.py
@@ -278,8 +278,13 @@ def add_arguments_datarun(parser):
# pa - passive aggressive
# knn - K nearest neighbors
# mlp - multi-layer perceptron
-    parser.add_argument('--methods', nargs='+', choices=METHODS,
-                        help='list of methods which the datarun will use')
+    parser.add_argument('--methods', nargs='+',
+                        type=option_or_path(METHODS, JSON_REGEX),
+                        help='Method or list of methods to use for '
+                             'classification. Each method can either be one of the '
+                             'pre-defined method codes listed below or a path to a '
+                             'JSON file defining a custom method.' +
+                             '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS))
parser.add_argument('--priority', type=int,
                        help='Priority of the datarun (higher = more important)')
parser.add_argument('--budget-type', choices=BUDGET_TYPES,
@@ -291,13 +296,21 @@ def add_arguments_datarun(parser):
'overrides the walltime budget.\nFormat: ' +
TIME_FMT.replace('%', '%%'))

-    # Which field to use for judgment of performance
+    # Which field to use to judge performance, for the sake of AutoML
# options:
# f1 - F1 score (harmonic mean of precision and recall)
# roc_auc - area under the Receiver Operating Characteristic curve
# accuracy - percent correct
# mu_sigma - one standard deviation below the average cross-validated F1
# score (mu - sigma)
# cohen_kappa - measures accuracy, but controls for chance of guessing
# correctly
# rank_accuracy - multiclass only: percent of examples for which the true
# label is in the top 1/3 most likely predicted labels
# ap - average precision: nearly identical to area under
# precision/recall curve.
# mcc - matthews correlation coefficient: good for unbalanced classes
#
# f1 and roc_auc may be appended with _micro or _macro to use with
# multiclass problems.
parser.add_argument('--metric', choices=METRICS,
help='Metric by which ATM should evaluate classifiers. '
'The metric function specified here will be used to '
@@ -328,7 +341,7 @@ def add_arguments_datarun(parser):
help='Type of BTB tuner to use. Can either be one of '
'the pre-configured tuners listed below or a path to a '
'custom tuner in the form "/path/to/tuner.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS))

# How should ATM select a particular hyperpartition from the set of all
# possible hyperpartitions?
@@ -347,7 +360,7 @@ def add_arguments_datarun(parser):
help='Type of BTB selector to use. Can either be one of '
'the pre-configured selectors listed below or a path to a '
                        'custom selector in the form "/path/to/selector.py:ClassName".'
-                        '\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))
+                        '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS))

# r_min is the number of random runs performed in each hyperpartition before
# allowing bayesian opt to select parameters. Consult the thesis to
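The --methods argument above relies on option_or_path, which is defined elsewhere in config.py and not shown in this diff. A minimal sketch of what such an argparse type callable could look like, assuming it accepts either a known option code or any string matching the given regex:

import argparse
import re

def option_or_path(options, regex):
    # Sketch only: pass a pre-defined option code through as-is, accept any
    # string matching `regex` (e.g. JSON_REGEX), and reject everything else.
    def type_check(arg):
        if arg in options or re.match(regex, arg):
            return arg
        raise argparse.ArgumentTypeError('%r is neither a known option nor a '
                                         'matching path' % arg)
    return type_check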
7 changes: 6 additions & 1 deletion atm/constants.py
@@ -1,3 +1,5 @@
+import os
+from atm import PROJECT_ROOT
# sample tuners
from btb.tuning import Uniform as UniformTuner, GP, GPEi, GPEiVelocity
# hyperpartition selectors
@@ -21,9 +23,12 @@
PARTITION_STATUS = ['incomplete', 'errored', 'gridding_done']

TIME_FMT = '%Y-%m-%d %H:%M'
-DATA_PATH = 'data/downloads'
+DATA_DL_PATH = os.path.join(PROJECT_ROOT, 'data/downloads')
+METHOD_PATH = os.path.join(PROJECT_ROOT, 'methods')
+LOG_PATH = os.path.join(PROJECT_ROOT, 'logs')

+CUSTOM_CLASS_REGEX = '(.*\.py):(\w+)$'
+JSON_REGEX = '(.*\.json)$'

TUNERS_MAP = {
'uniform': UniformTuner,
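For illustration, how the two new regexes distinguish the argument styles (the paths below are hypothetical):

import re
from atm.constants import CUSTOM_CLASS_REGEX, JSON_REGEX

re.match(JSON_REGEX, 'methods/my_method.json')             # matches: a custom method file
re.match(CUSTOM_CLASS_REGEX, '/path/to/tuner.py:MyTuner')  # groups: ('/path/to/tuner.py', 'MyTuner')
re.match(JSON_REGEX, 'knn')                                # no match: treated as a method code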
2 changes: 1 addition & 1 deletion atm/database.py
@@ -168,7 +168,7 @@ class Hyperpartition(Base):
datarun = relationship('Datarun', back_populates='hyperpartitions')

# these columns define the partition
-    method = Column(String(15))
+    method = Column(String(255))
categoricals64 = Column(Text)
tunables64 = Column(Text)
constants64 = Column(Text)
2 changes: 1 addition & 1 deletion atm/enter_data.py
@@ -137,7 +137,7 @@ def enter_datarun(sql_config, run_config, aws_config=None,
method_parts = {}
for m in run_config.methods:
# enumerate all combinations of categorical variables for this method
-        method = Method(METHODS_MAP[m])
+        method = Method(m)
method_parts[m] = method.get_hyperpartitions()
print('method', m, 'has', len(method_parts[m]), 'hyperpartitions')

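With Method(m) taking the configured string directly, one code path now covers both built-in codes and custom files. A hypothetical usage sketch (the custom path is illustrative):

from atm.method import Method

# built-in code: resolved through METHODS_MAP to a bundled JSON file
knn_parts = Method('knn').get_hyperpartitions()

# custom method: any path to a JSON file with the same structure
custom_parts = Method('methods/my_mlp.json').get_hyperpartitions()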
204 changes: 173 additions & 31 deletions atm/method.py
@@ -1,8 +1,88 @@
+from builtins import object, str as newstr
+
import json
from os.path import join
-from btb import HyperParameter

-CONFIG_PATH = 'methods'
+import btb
+from atm.constants import METHODS_MAP, METHOD_PATH


class HyperParameter(object):
@property
def is_categorical(self):
return False

@property
def is_constant(self):
return False


class Numeric(HyperParameter):
def __init__(self, name, type, range):
self.name = name
self.type = type
self.range = range

@property
def is_constant(self):
return len(self.range) == 1

def as_tunable(self):
return btb.HyperParameter(typ=self.type, rang=self.range)


class Categorical(HyperParameter):
def __init__(self, name, type, values):
self.name = name
self.type = type
for i, val in enumerate(values):
if val is None:
# the value None is allowed for every parameter type
continue
if self.type == 'int_cat':
values[i] = int(val)
elif self.type == 'float_cat':
values[i] = float(val)
elif self.type == 'string':
# this is necessary to avoid a bug in sklearn, which won't be
# fixed until 0.20
values[i] = str(newstr(val))
elif self.type == 'bool':
values[i] = bool(val)
self.values = values

@property
def is_categorical(self):
return True

@property
def is_constant(self):
return len(self.values) == 1

def as_tunable(self):
return btb.HyperParameter(typ=self.type, rang=self.values)


class List(HyperParameter):
def __init__(self, name, type, list_length, element):
self.name = name
self.size = Categorical('len(%s)' % self.name, 'int_cat', list_length)
element_type = HYPERPARAMETER_TYPES[element['type']]
self.element = element_type('element', **element)

@property
def is_categorical(self):
return True

def get_elements(self):
elements = []
for i in range(max(self.size.values)):
# generate names for the pseudo-hyperparameters in the list
elt_name = '%s[%d]' % (self.name, i)
elements.append(elt_name)

conditions = {str(i): elements[:i] for i in self.size.values}
return elements, conditions
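# Illustrative sketch, not part of the diff: for a hypothetical list
# hyperparameter named 'hidden_layer_sizes' with allowed lengths [1, 2, 3],
# get_elements() produces one pseudo-hyperparameter per possible slot plus a
# conditions map keyed by list size:
#
#     elements   == ['hidden_layer_sizes[0]', 'hidden_layer_sizes[1]',
#                    'hidden_layer_sizes[2]']
#     conditions == {'1': elements[:1], '2': elements[:2], '3': elements[:3]}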


class HyperPartition(object):
@@ -11,58 +91,115 @@ class HyperPartition(object):
"""
def __init__(self, categoricals, constants, tunables):
"""
-        categoricals: the values for this hyperpartition which have been fixed, thus
-            defining the hyperpartition
-        constants: the values for this hyperpartition for which there was no choice
-        tunables: the free variables which must be tuned
+        categoricals: the hyperparameter values for this hyperpartition which
+            have been fixed, defining the hyperpartition
+        constants: the hyperparameters with only one choice
+        tunables: the numeric hyperparameters which must be tuned (of type
+            btb.HyperParameter)
"""
self.categoricals = categoricals
self.constants = constants
self.tunables = tunables


HYPERPARAMETER_TYPES = {
'int': Numeric,
'int_exp': Numeric,
'float': Numeric,
'float_exp': Numeric,
'int_cat': Categorical,
'float_cat': Categorical,
'string': Categorical,
'bool': Categorical,
'list': List,
}
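# Illustrative sketch, not part of the diff: how a JSON spec is dispatched
# through HYPERPARAMETER_TYPES (the name and range here are hypothetical).
#
#     spec = {'type': 'float_exp', 'range': [1e-5, 1e5]}
#     param = HYPERPARAMETER_TYPES[spec['type']](name='C', **spec)
#
# param is a Numeric; param.is_constant is False, and param.as_tunable()
# wraps it as a btb.HyperParameter for the tuner.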


class Method(object):
"""
This class is initialized with the name of a json configuration file.
The config contains information about a classification method and the
hyperparameter arguments it needs to run. Its main purpose is to generate
hyperpartitions (possible combinations of categorical hyperparameters).
"""
-    def __init__(self, config):
+    def __init__(self, method):
"""
-        config: JSON dictionary containing all the information needed to specify
-            this enumerator
+        method: method code or path to JSON file containing all the information
+            needed to specify this enumerator.
        """
-        with open(join(CONFIG_PATH, config)) as f:
+        if method in METHODS_MAP:
+            # if the configured method is a code, look up the path to its json
+            config_path = join(METHOD_PATH, METHODS_MAP[method])
+        else:
+            # otherwise, it must be a path to a file
+            config_path = method
+
+        with open(config_path) as f:
config = json.load(f)

self.name = config['name']
-        self.conditions = config['conditions']
-        self.root_params = config['root_parameters']
+        self.root_params = config['root_hyperparameters']
+        self.conditions = config['conditional_hyperparameters']
self.class_path = config['class']

# create hyperparameters from the parameter config
-        self.parameters = {k: HyperParameter(typ=v['type'], rang=v['range'])
-                           for k, v in config['parameters'].items()}
+        self.parameters = {}
+        lists = []
+        for k, v in config['hyperparameters'].items():
+            param_type = HYPERPARAMETER_TYPES[v['type']]
+            self.parameters[k] = param_type(name=k, **v)

# List hyperparameters are special. These are replaced in the
# CPT with a size hyperparameter and sets of element hyperparameters
# conditioned on the size.
for name, param in self.parameters.items():
if type(param) == List:
elements, conditions = param.get_elements()
for e in elements:
self.parameters[e] = param.element

-    def get_hyperpartitions(self):
# add the size parameter, remove the list parameter
self.parameters[param.size.name] = param.size
del self.parameters[param.name]

# if this is a root param, replace its name with the new size
# name in the root params list
if param.name in self.root_params:
self.root_params.append(param.size.name)
self.root_params.remove(param.name)

# if this is a conditional param, replace it there instead
for cond, deps in self.conditions.items():
if param.name in deps:
deps.append(param.size.name)
deps.remove(param.name)
self.conditions[cond] = deps

# finally, add all the potential sets of list elements as
# conditions of the list's size
self.conditions[param.size.name] = conditions

def _sort_parameters(self, params):
"""
-        Traverse the CPT and enumerate all possible hyperpartitions of parameters
-        for this method
+        Sort a list of HyperParameter objects into lists of constants,
+        categoricals, and tunables.
"""
constants = []
categoricals = []
tunables = []
-        for p in self.root_params:
-            if len(self.parameters[p].range) == 1:
-                constants.append((p, self.parameters[p].range[0]))
-            elif self.parameters[p].is_categorical:
+        for p in params:
+            param = self.parameters[p]
+            if param.is_constant:
+                if param.is_categorical:
+                    constants.append((p, param.values[0]))
+                else:
+                    constants.append((p, param.range[0]))
+            elif param.is_categorical:
categoricals.append(p)
else:
-                tunables.append((p, self.parameters[p]))
+                tunables.append((p, param.as_tunable()))

-        return self._enumerate([], constants, categoricals, tunables)
+        return constants, categoricals, tunables

def _enumerate(self, fixed_cats, constants, free_cats, tunables):
"""
@@ -88,7 +225,7 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
# variables, and see where that takes us
cat = free_cats.pop(0)

-        for val in self.parameters[cat].range:
+        for val in self.parameters[cat].values:
# add this value to the list of qualified categoricals
new_fixed_cats = fixed_cats + [(cat, val)]

@@ -103,13 +240,11 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
# must be strings.
if cat in self.conditions and str(val) in self.conditions[cat]:
# categorize the conditional variables which are now in play
-                for p in self.conditions[cat][str(val)]:
-                    if len(self.parameters[p].range) == 1:
-                        new_constants.append((p, self.parameters[p].range[0]))
-                    elif self.parameters[p].is_categorical:
-                        new_free_cats.append(p)
-                    else:
-                        new_tunables.append((p, self.parameters[p]))
+                new_params = self.conditions[cat][str(val)]
+                cons, cats, tuns = self._sort_parameters(new_params)
+                new_constants = constants + cons
+                new_free_cats = free_cats + cats
+                new_tunables = tunables + tuns

# recurse with the newly qualified categorical as a constant
parts.extend(self._enumerate(fixed_cats=new_fixed_cats,
@@ -118,3 +253,10 @@ def _enumerate(self, fixed_cats, constants, free_cats, tunables):
tunables=new_tunables))

return parts

def get_hyperpartitions(self):
"""
Traverse the CPT and enumerate all possible hyperpartitions of
categorical parameters for this method.
"""
return self._enumerate([], *self._sort_parameters(self.root_params))
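To make the recursion concrete, a small hypothetical walk-through of get_hyperpartitions; the method values are illustrative, not from this commit:

# hyperparameters:
#     kernel: {'type': 'string', 'values': ['linear', 'rbf']}
#     C:      {'type': 'float_exp', 'range': [1e-5, 1e5]}
# root_hyperparameters: ['kernel', 'C']
# conditional_hyperparameters: {}
#
# _sort_parameters puts 'kernel' in categoricals and 'C' in tunables;
# _enumerate then fixes 'kernel' to each of its values in turn and, with no
# conditions to expand and no free categoricals left, yields two partitions:
#
#     HyperPartition(categoricals=[('kernel', 'linear')], constants=[],
#                    tunables=[('C', <btb.HyperParameter>)])
#     HyperPartition(categoricals=[('kernel', 'rbf')], constants=[],
#                    tunables=[('C', <btb.HyperParameter>)])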
