Get development up to date with master #38

Merged
merged 4 commits on Jul 11, 2023
11 changes: 10 additions & 1 deletion README.md
@@ -16,6 +16,7 @@ pip install -e DEHB # -e stands for editable, lets you modify the code and reru
* [01 - Using DEHB to optimize 4 hyperparameters of a Scikit-learn's Random Forest on a classification dataset](examples/01_Optimizing_RandomForest_using_DEHB.ipynb)
* [02 - Optimizing Scikit-learn's Random Forest without using ConfigSpace to represent the hyperparameter space](examples/02_using%20DEHB_without_ConfigSpace.ipynb)
* [03 - Hyperparameter Optimization for MNIST in PyTorch](examples/03_pytorch_mnist_hpo.py)
* [04 - A generic template for using MODEHB for multi-objective Hyperparameter Optimization](examples/04_mo_pytorch_mnist_hpo.py)

To run PyTorch example: (*note additional requirements*)
```bash
@@ -81,7 +82,8 @@ python examples/03_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \
--verbose --runtime 60 --scheduler_file dask_dump/scheduler.json
```


### Running DEHB to optimize multiple objectives
To run multi-objective optimization, one extra parameter is required: `mo_strategy`. Two strategies are provided: non-dominated sorting (NDS) with crowding distance (NSGA-II) and NDS with eps-net (EPSNET). See `examples/04_mo_pytorch_mnist_hpo.py` to get started.
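
For example, the bundled example can be launched with either strategy; the flags below mirror the example's argument parser (adjust budgets and runtime to your setup):
```bash
python examples/04_mo_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \
    --mo_strategy NSGA-II --verbose --runtime 60
```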

### DEHB Hyperparameters

@@ -122,3 +124,10 @@ represents the *mutation* strategy while `bin` represents the *binomial crossove
editor = {Z. Zhou},
year = {2021}
}

@online{Awad-arXiv-2023,
title = {MO-DEHB: Evolutionary-based Hyperband for Multi-Objective Optimization},
author = {Noor Awad and Ayushi Sharma and Frank Hutter},
year = {2023},
keywords = {}
}
322 changes: 322 additions & 0 deletions examples/04_mo_pytorch_mnist_hpo.py
@@ -0,0 +1,322 @@
"""
This script runs multi-objective hyperparameter optimisation using MODEHB to tune the architecture and
training hyperparameters of a neural network trained on MNIST in PyTorch. It minimizes two objectives:
validation loss and model size (the log of the parameter count).
This example extends the single-objective example '03_pytorch_mnist_hpo.py' to the multi-objective setting.

Additional requirements:
* torch>=1.7.1
* torchvision>=0.8.2
* torchsummary>=1.5.1

PyTorch code referenced from: https://github.com/pytorch/examples/blob/master/mnist/main.py
"""

import argparse
import os
import pickle
import time

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from distributed import Client
from torchsummary import summary
from torchvision import transforms

from dehb import MODEHB


class Model(nn.Module):
def __init__(self, config, img_dim=28, output_dim=10):
super().__init__()
self.output_dim = output_dim
self.pool_kernel = 2
self.pool_stride = 1
self.maxpool = nn.MaxPool2d(self.pool_kernel, self.pool_stride)
self.conv1 = nn.Conv2d(
in_channels=1,
out_channels=config["channels_1"],
kernel_size=config["kernel_1"],
stride=config["stride_1"],
padding=0,
dilation=1
)
# updating image size after conv1
img_dim = self._update_size(img_dim, config["kernel_1"], config["stride_1"], 0, 1)
self.conv2 = nn.Conv2d(
in_channels=config["channels_1"],
out_channels=config["channels_2"],
kernel_size=config["kernel_2"],
stride=config["stride_2"],
padding=0,
dilation=1
)
# updating image size after conv2
img_dim = self._update_size(img_dim, config["kernel_2"], config["stride_2"], 0, 1)
# updating image size after maxpool
img_dim = self._update_size(img_dim, self.pool_kernel, self.pool_stride, 0, 1)
self.dropout = nn.Dropout(config["dropout"])
hidden_dim = config["hidden"]
self.fc1 = nn.Linear(img_dim * img_dim * config["channels_2"], hidden_dim)
self.fc2 = nn.Linear(hidden_dim, self.output_dim)

def forward(self, x):
# Layer 1
x = self.conv1(x)
x = F.relu(x)
x = self.dropout(x)
# Layer 2
x = self.conv2(x)
x = F.relu(x)
x = self.maxpool(x)
x = self.dropout(x)
# FC Layer 1
x = torch.flatten(x, 1)
x = self.fc1(x)
# Output layer
x = self.fc2(x)
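        # log-probabilities, consumed by F.nll_loss in train() and evaluate()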
output = F.log_softmax(x, dim=1)
return output

def _update_size(self, dim, kernel_size, stride, padding, dilation):
return int(np.floor((dim + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1))


def get_configspace(seed=None):
cs = CS.ConfigurationSpace(seed)

# Hyperparameter defining first Conv layer
kernel1 = CSH.OrdinalHyperparameter("kernel_1", sequence=[3, 5, 7], default_value=5)
channels1 = CSH.UniformIntegerHyperparameter("channels_1", lower=3, upper=64,
default_value=32)
stride1 = CSH.UniformIntegerHyperparameter("stride_1", lower=1, upper=2, default_value=1)
cs.add_hyperparameters([kernel1, channels1, stride1])

# Hyperparameter defining second Conv layer
kernel2 = CSH.OrdinalHyperparameter("kernel_2", sequence=[3, 5, 7], default_value=5)
channels2 = CSH.UniformIntegerHyperparameter("channels_2", lower=3, upper=64,
default_value=32)
stride2 = CSH.UniformIntegerHyperparameter("stride_2", lower=1, upper=2, default_value=1)
cs.add_hyperparameters([kernel2, channels2, stride2])

# Hyperparameter for FC layer
hidden = CSH.UniformIntegerHyperparameter(
"hidden", lower=32, upper=256, log=True, default_value=128
)
cs.add_hyperparameter(hidden)

# Regularization Hyperparameter
dropout = CSH.UniformFloatHyperparameter("dropout", lower=0, upper=0.5, default_value=0.1)
cs.add_hyperparameter(dropout)

# Training Hyperparameters
batch_size = CSH.OrdinalHyperparameter(
"batch_size", sequence=[2, 4, 8, 16, 32, 64], default_value=4
)
lr = CSH.UniformFloatHyperparameter("lr", lower=1e-6, upper=0.1, log=True,
default_value=1e-3)
cs.add_hyperparameters([batch_size, lr])
return cs


def train(model, device, train_loader, optimizer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()


def evaluate(model, device, data_loader, acc=False):
model.eval()
loss = 0
correct = 0
with torch.no_grad():
for data, target in data_loader:
data, target = data.to(device), target.to(device)
output = model(data)
loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()

loss /= len(data_loader.dataset)
correct /= len(data_loader.dataset)

if acc:
return correct
return loss


def train_and_evaluate(config, max_budget, verbose=False, **kwargs):
device = kwargs["device"]
batch_size = config["batch_size"]
train_set = kwargs["train_set"]
test_set = kwargs["test_set"]
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
model = Model(config).to(device)
optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])
for epoch in range(1, int(max_budget) + 1):
train(model, device, train_loader, optimizer)
accuracy = evaluate(model, device, test_loader, acc=True)
    # model-size objective: natural log of the total parameter count
    num_params = np.log(sum(p.numel() for p in model.parameters()))
if verbose:
summary(model, (1, 28, 28)) # image dimensions for MNIST
return [accuracy, num_params]


def objective_function(config, budget, **kwargs):
""" The target function to minimize for HPO"""
device = kwargs["device"]

# Data Loaders
batch_size = config["batch_size"]
train_set = kwargs["train_set"]
valid_set = kwargs["valid_set"]
test_set = kwargs["test_set"]
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

# Build model
model = Model(config).to(device)

# Optimizer
optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])

start = time.time() # measuring wallclock time
for epoch in range(1, int(budget) + 1):
train(model, device, train_loader, optimizer)
loss = evaluate(model, device, valid_loader)
cost = time.time() - start

# not including test score computation in the `cost`
test_loss = evaluate(model, device, test_loader)

    # model-size objective: natural log of the total parameter count
    num_params = np.log(sum(p.numel() for p in model.parameters()))

# dict representation that DEHB requires
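    # both entries in 'fitness' are minimized: [validation loss, log(#parameters)]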
res = {
"fitness": [loss, num_params],
"cost": cost,
"info": {"test_loss": test_loss, "budget": budget}
}
return res


def input_arguments():
parser = argparse.ArgumentParser(description='Optimizing MNIST in PyTorch using DEHB.')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=123, metavar='S',
help='random seed (default: 123)')
parser.add_argument('--refit_training', action='store_true', default=False,
help='Refit with incumbent configuration on full training data and budget')
parser.add_argument('--min_budget', type=float, default=1,
help='Minimum budget (epoch length)')
parser.add_argument('--max_budget', type=float, default=25,
help='Maximum budget (epoch length)')
parser.add_argument('--eta', type=int, default=3,
help='Parameter for Hyperband controlling early stopping aggressiveness')
parser.add_argument('--output_path', type=str, default="./pytorch_mnist_dehb",
help='Directory for DEHB to write logs and outputs')
parser.add_argument('--scheduler_file', type=str, default=None,
help='The file to connect a Dask client with a Dask scheduler')
parser.add_argument('--n_workers', type=int, default=1,
help='Number of CPU workers for DEHB to distribute function evaluations to')
    parser.add_argument('--single_node_with_gpus', default=False, action="store_true",
                        help='If True, signals the DEHB run to assume all required GPUs are on '
                             'the same node/machine. Should be set to True if no client is '
                             'passed and n_workers > 1. Should be set to False if a client is '
                             'created from a scheduler file; the onus of GPU usage is then '
                             'on the Dask workers created and mapped to the scheduler file.')
mo_strategy_choices = ['EPSNET', 'NSGA-II']
    parser.add_argument('--mo_strategy', default="EPSNET", choices=mo_strategy_choices,
                        type=str, nargs='?',
                        help="Specify the multi-objective strategy, one of {}".format(mo_strategy_choices))
parser.add_argument('--verbose', action="store_true", default=False,
help='Decides verbosity of DEHB optimization')
parser.add_argument('--runtime', type=float, default=300,
help='Total time in seconds as budget to run DEHB')
args = parser.parse_args()
return args


def main():
args = input_arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

torch.manual_seed(args.seed)

# Data Preparation
transform = transforms.Compose([
transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))
])
train_set = torchvision.datasets.MNIST(
root='./data', train=True, download=True, transform=transform
)
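    # hold out 10,000 of the 60,000 MNIST training images for validation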
train_set, valid_set = torch.utils.data.random_split(train_set, [50000, 10000])
test_set = torchvision.datasets.MNIST(
root='./data', train=False, download=True, transform=transform
)

# Get configuration space
cs = get_configspace(args.seed)
dimensions = len(cs.get_hyperparameters())

    # Some insights into Dask interfaces to DEHB and handling GPU devices for parallelism:
    # * if args.scheduler_file is specified, args.n_workers need not be specified --- since
    #   args.scheduler_file indicates a Dask client/server is active
    # * if args.scheduler_file is not specified and args.n_workers > 1 --- the DEHB object
    #   creates a Dask client at instantiation, which dies with the associated DEHB object
    # * if args.single_node_with_gpus is True --- assumes that all GPU devices indicated
    #   through the environment variable "CUDA_VISIBLE_DEVICES" reside on the same machine

# Dask checks and setups
single_node_with_gpus = args.single_node_with_gpus
if args.scheduler_file is not None and os.path.isfile(args.scheduler_file):
client = Client(scheduler_file=args.scheduler_file)
# explicitly delegating GPU handling to Dask workers defined
single_node_with_gpus = False
else:
client = None

###########################
# DEHB optimisation block #
###########################
np.random.seed(args.seed)
modehb = MODEHB(objective_function=objective_function, cs=cs, dimensions=dimensions, min_budget=args.min_budget,
max_budget=args.max_budget, eta=args.eta, output_path=args.output_path,
num_objectives=2, mo_strategy=args.mo_strategy,
# if client is not None and of type Client, n_workers is ignored
# if client is None, a Dask client with n_workers is set up
client=client, n_workers=args.n_workers)
runtime, history = modehb.run(total_cost=args.runtime, verbose=args.verbose,
# arguments below are part of **kwargs shared across workers
train_set=train_set, valid_set=valid_set, test_set=test_set,
single_node_with_gpus=single_node_with_gpus, device=device)
# end of DEHB optimisation

# Saving optimisation trace history
name = time.strftime("%x %X %Z", time.localtime(modehb.start))
name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
modehb.logger.info("Saving optimisation trace history...")
with open(os.path.join(args.output_path, "history_{}.pkl".format(name)), "wb") as f:
pickle.dump(history, f)
modehb.logger.info("pareto population:{}", modehb.pareto_pop)
modehb.logger.info("pareto fitness:{}", modehb.pareto_fit)
modehb.logger.debug("runtime:{}", runtime)


if __name__ == "__main__":
main()