Merge pull request #38 from automl/master
Get development up to date with master
Bronzila committed Jul 11, 2023
2 parents d9da120 + 20a6b53 commit 54ce41c
Showing 2 changed files with 332 additions and 1 deletion.
11 changes: 10 additions & 1 deletion README.md
@@ -20,6 +20,7 @@ pip install -e DEHB # -e stands for editable, lets you modify the code and rerun things
* [01 - Using DEHB to optimize 4 hyperparameters of a Scikit-learn's Random Forest on a classification dataset](examples/01_Optimizing_RandomForest_using_DEHB.ipynb)
* [02 - Optimizing Scikit-learn's Random Forest without using ConfigSpace to represent the hyperparameter space](examples/02_using%20DEHB_without_ConfigSpace.ipynb)
* [03 - Hyperparameter Optimization for MNIST in PyTorch](examples/03_pytorch_mnist_hpo.py)
* [04 - A generic template to use MODEHB for multi-objective Hyperparameter Optimization](examples/04_mo_pytorch_mnist_hpo.py)

To run PyTorch example: (*note additional requirements*)
```bash
@@ -85,7 +86,8 @@ python examples/03_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \
--verbose --runtime 60 --scheduler_file dask_dump/scheduler.json
```


### Running DEHB to optimize multiple objectives
To run multi-objective optimization, one extra parameter is required, `mo_strategy`: MO-optimization is provided using non-dominated sorting (NDS) with crowding distance (NSGA-II) and NDS with eps-net (EPSNET). See the example [04_mo_pytorch_mnist_hpo.py](examples/04_mo_pytorch_mnist_hpo.py) to get started; a minimal usage sketch follows.
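
A minimal sketch of how `mo_strategy` is passed to `MODEHB`, mirroring the example script added in this commit. Here `objective_function` (which should return a dict with `"fitness"` as a list of the two objectives and a `"cost"`) and the ConfigSpace `cs` are placeholders, and the budgets and output path are illustrative values only:

```python
from dehb import MODEHB

# Sketch only: `objective_function` and `cs` stand in for your target function
# and ConfigSpace search space, as in examples/04_mo_pytorch_mnist_hpo.py.
modehb = MODEHB(
    objective_function=objective_function,
    cs=cs,
    dimensions=len(cs.get_hyperparameters()),
    min_budget=1,
    max_budget=25,
    eta=3,
    output_path="./mo_results",      # illustrative output directory
    num_objectives=2,
    mo_strategy="NSGA-II",           # or "EPSNET"
    n_workers=1,
)
runtime, history = modehb.run(total_cost=300, verbose=True)
```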

### DEHB Hyperparameters

@@ -126,3 +128,10 @@ represents the *mutation* strategy while `bin` represents the *binomial crossover*
editor = {Z. Zhou},
year = {2021}
}
@online{Awad-arXiv-2023,
  title = {MO-DEHB: Evolutionary-based Hyperband for Multi-Objective Optimization},
  author = {Noor Awad and Ayushi Sharma and Frank Hutter},
  year = {2023}
}
322 changes: 322 additions & 0 deletions examples/04_mo_pytorch_mnist_hpo.py
@@ -0,0 +1,322 @@
"""
This script runs a multi-objective hyperparameter optimisation using MODEHB to tune the architecture and
training hyperparameters of a neural network on MNIST in PyTorch. It minimizes two objectives: validation loss and model size.
This example extends the single-objective problem '03_pytorch_mnist_hpo.py' to the multi-objective setting.
Additional requirements:
* torch>=1.7.1
* torchvision>=0.8.2
* torchsummary>=1.5.1
PyTorch code referenced from: https://github.com/pytorch/examples/blob/master/mnist/main.py
"""

import argparse
import os
import pickle
import time

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from distributed import Client
from torchsummary import summary
from torchvision import transforms

from dehb import MODEHB


class Model(nn.Module):
def __init__(self, config, img_dim=28, output_dim=10):
super().__init__()
self.output_dim = output_dim
self.pool_kernel = 2
self.pool_stride = 1
self.maxpool = nn.MaxPool2d(self.pool_kernel, self.pool_stride)
self.conv1 = nn.Conv2d(
in_channels=1,
out_channels=config["channels_1"],
kernel_size=config["kernel_1"],
stride=config["stride_1"],
padding=0,
dilation=1
)
# updating image size after conv1
img_dim = self._update_size(img_dim, config["kernel_1"], config["stride_1"], 0, 1)
self.conv2 = nn.Conv2d(
in_channels=config["channels_1"],
out_channels=config["channels_2"],
kernel_size=config["kernel_2"],
stride=config["stride_2"],
padding=0,
dilation=1
)
# updating image size after conv2
img_dim = self._update_size(img_dim, config["kernel_2"], config["stride_2"], 0, 1)
# updating image size after maxpool
img_dim = self._update_size(img_dim, self.pool_kernel, self.pool_stride, 0, 1)
self.dropout = nn.Dropout(config["dropout"])
hidden_dim = config["hidden"]
self.fc1 = nn.Linear(img_dim * img_dim * config["channels_2"], hidden_dim)
self.fc2 = nn.Linear(hidden_dim, self.output_dim)

def forward(self, x):
# Layer 1
x = self.conv1(x)
x = F.relu(x)
x = self.dropout(x)
# Layer 2
x = self.conv2(x)
x = F.relu(x)
x = self.maxpool(x)
x = self.dropout(x)
# FC Layer 1
x = torch.flatten(x, 1)
x = self.fc1(x)
# Output layer
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output

def _update_size(self, dim, kernel_size, stride, padding, dilation):
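        # standard Conv2d/MaxPool2d output-size formula:
        # floor((dim + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)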
return int(np.floor((dim + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1))


def get_configspace(seed=None):
cs = CS.ConfigurationSpace(seed)

# Hyperparameter defining first Conv layer
kernel1 = CSH.OrdinalHyperparameter("kernel_1", sequence=[3, 5, 7], default_value=5)
channels1 = CSH.UniformIntegerHyperparameter("channels_1", lower=3, upper=64,
default_value=32)
stride1 = CSH.UniformIntegerHyperparameter("stride_1", lower=1, upper=2, default_value=1)
cs.add_hyperparameters([kernel1, channels1, stride1])

# Hyperparameter defining second Conv layer
kernel2 = CSH.OrdinalHyperparameter("kernel_2", sequence=[3, 5, 7], default_value=5)
channels2 = CSH.UniformIntegerHyperparameter("channels_2", lower=3, upper=64,
default_value=32)
stride2 = CSH.UniformIntegerHyperparameter("stride_2", lower=1, upper=2, default_value=1)
cs.add_hyperparameters([kernel2, channels2, stride2])

# Hyperparameter for FC layer
hidden = CSH.UniformIntegerHyperparameter(
"hidden", lower=32, upper=256, log=True, default_value=128
)
cs.add_hyperparameter(hidden)

# Regularization Hyperparameter
dropout = CSH.UniformFloatHyperparameter("dropout", lower=0, upper=0.5, default_value=0.1)
cs.add_hyperparameter(dropout)

# Training Hyperparameters
batch_size = CSH.OrdinalHyperparameter(
"batch_size", sequence=[2, 4, 8, 16, 32, 64], default_value=4
)
lr = CSH.UniformFloatHyperparameter("lr", lower=1e-6, upper=0.1, log=True,
default_value=1e-3)
cs.add_hyperparameters([batch_size, lr])
return cs
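
# Illustrative usage (not executed here): a sampled configuration behaves like a dict
# and can be passed directly to Model, e.g.
#   cs = get_configspace(seed=123)
#   config = cs.sample_configuration()
#   model = Model(config)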


def train(model, device, train_loader, optimizer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()


def evaluate(model, device, data_loader, acc=False):
model.eval()
loss = 0
correct = 0
with torch.no_grad():
for data, target in data_loader:
data, target = data.to(device), target.to(device)
output = model(data)
loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()

loss /= len(data_loader.dataset)
correct /= len(data_loader.dataset)

if acc:
return correct
return loss


def train_and_evaluate(config, max_budget, verbose=False, **kwargs):
device = kwargs["device"]
batch_size = config["batch_size"]
train_set = kwargs["train_set"]
test_set = kwargs["test_set"]
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
model = Model(config).to(device)
optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])
for epoch in range(1, int(max_budget) + 1):
train(model, device, train_loader, optimizer)
accuracy = evaluate(model, device, test_loader, acc=True)
    num_params = np.log(sum(p.numel() for p in model.parameters()))  # log of parameter count (model-size objective)
if verbose:
summary(model, (1, 28, 28)) # image dimensions for MNIST
return [accuracy, num_params]


def objective_function(config, budget, **kwargs):
""" The target function to minimize for HPO"""
device = kwargs["device"]

# Data Loaders
batch_size = config["batch_size"]
train_set = kwargs["train_set"]
valid_set = kwargs["valid_set"]
test_set = kwargs["test_set"]
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

# Build model
model = Model(config).to(device)

# Optimizer
optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])

start = time.time() # measuring wallclock time
for epoch in range(1, int(budget) + 1):
train(model, device, train_loader, optimizer)
loss = evaluate(model, device, valid_loader)
cost = time.time() - start

# not including test score computation in the `cost`
test_loss = evaluate(model, device, test_loader)

    # log of the number of model parameters (the second objective, minimized alongside validation loss)
    num_params = np.log(sum(p.numel() for p in model.parameters()))

# dict representation that DEHB requires
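    # here, "fitness" is a list of the two objectives, both minimized: validation loss and log(#parameters)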
res = {
"fitness": [loss, num_params],
"cost": cost,
"info": {"test_loss": test_loss, "budget": budget}
}
return res


def input_arguments():
parser = argparse.ArgumentParser(description='Optimizing MNIST in PyTorch using DEHB.')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=123, metavar='S',
help='random seed (default: 123)')
parser.add_argument('--refit_training', action='store_true', default=False,
help='Refit with incumbent configuration on full training data and budget')
parser.add_argument('--min_budget', type=float, default=1,
help='Minimum budget (epoch length)')
parser.add_argument('--max_budget', type=float, default=25,
help='Maximum budget (epoch length)')
parser.add_argument('--eta', type=int, default=3,
help='Parameter for Hyperband controlling early stopping aggressiveness')
parser.add_argument('--output_path', type=str, default="./pytorch_mnist_dehb",
help='Directory for DEHB to write logs and outputs')
parser.add_argument('--scheduler_file', type=str, default=None,
help='The file to connect a Dask client with a Dask scheduler')
parser.add_argument('--n_workers', type=int, default=1,
help='Number of CPU workers for DEHB to distribute function evaluations to')
parser.add_argument('--single_node_with_gpus', default=False, action="store_true",
help='If True, signals the DEHB run to assume all required GPUs are on '
'the same node/machine. To be specified as True if no client is '
'passed and n_workers > 1. Should be set to False if a client is '
                             'created from a scheduler file. The onus of GPU usage is then '
                             'on the Dask workers created and mapped to the scheduler file.')
mo_strategy_choices = ['EPSNET', 'NSGA-II']
parser.add_argument('--mo_strategy', default="EPSNET", choices=mo_strategy_choices,
type=str, nargs='?',
help="specify the multiobjective strategy from among {}".format(mo_strategy_choices))
parser.add_argument('--verbose', action="store_true", default=False,
help='Decides verbosity of DEHB optimization')
parser.add_argument('--runtime', type=float, default=300,
help='Total time in seconds as budget to run DEHB')
args = parser.parse_args()
return args


def main():
args = input_arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

torch.manual_seed(args.seed)

# Data Preparation
transform = transforms.Compose([
transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))
])
train_set = torchvision.datasets.MNIST(
root='./data', train=True, download=True, transform=transform
)
train_set, valid_set = torch.utils.data.random_split(train_set, [50000, 10000])
test_set = torchvision.datasets.MNIST(
root='./data', train=False, download=True, transform=transform
)

# Get configuration space
cs = get_configspace(args.seed)
dimensions = len(cs.get_hyperparameters())

# Some insights into Dask interfaces to DEHB and handling GPU devices for parallelism:
    # * if args.scheduler_file is specified, args.n_workers need not be specified --- since
    #   args.scheduler_file indicates a Dask client/server is active
    # * if args.scheduler_file is not specified and args.n_workers > 1 --- the DEHB object
    #   creates a Dask client at instantiation, which dies with the associated DEHB object
    # * if args.single_node_with_gpus is True --- assumes that all GPU devices indicated
    #   through the environment variable "CUDA_VISIBLE_DEVICES" reside on the same machine

# Dask checks and setups
single_node_with_gpus = args.single_node_with_gpus
if args.scheduler_file is not None and os.path.isfile(args.scheduler_file):
client = Client(scheduler_file=args.scheduler_file)
# explicitly delegating GPU handling to Dask workers defined
single_node_with_gpus = False
else:
client = None

###########################
# DEHB optimisation block #
###########################
np.random.seed(args.seed)
modehb = MODEHB(objective_function=objective_function, cs=cs, dimensions=dimensions, min_budget=args.min_budget,
max_budget=args.max_budget, eta=args.eta, output_path=args.output_path,
num_objectives=2, mo_strategy=args.mo_strategy,
# if client is not None and of type Client, n_workers is ignored
# if client is None, a Dask client with n_workers is set up
client=client, n_workers=args.n_workers)
runtime, history = modehb.run(total_cost=args.runtime, verbose=args.verbose,
# arguments below are part of **kwargs shared across workers
train_set=train_set, valid_set=valid_set, test_set=test_set,
single_node_with_gpus=single_node_with_gpus, device=device)
# end of DEHB optimisation

# Saving optimisation trace history
name = time.strftime("%x %X %Z", time.localtime(modehb.start))
name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
modehb.logger.info("Saving optimisation trace history...")
with open(os.path.join(args.output_path, "history_{}.pkl".format(name)), "wb") as f:
pickle.dump(history, f)
modehb.logger.info("pareto population:{}", modehb.pareto_pop)
modehb.logger.info("pareto fitness:{}", modehb.pareto_fit)
modehb.logger.debug("runtime:{}", runtime)


if __name__ == "__main__":
main()
