diff --git a/README.md b/README.md
index d42f7f1..25b0391 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ pip install -e DEHB # -e stands for editable, lets you modify the code and reru
 * [01 - Using DEHB to optimize 4 hyperparameters of a Scikit-learn's Random Forest on a classification dataset](examples/01_Optimizing_RandomForest_using_DEHB.ipynb)
 * [02 - Optimizing Scikit-learn's Random Forest without using ConfigSpace to represent the hyperparameter space](examples/02_using%20DEHB_without_ConfigSpace.ipynb)
 * [03 - Hyperparameter Optimization for MNIST in PyTorch](examples/03_pytorch_mnist_hpo.py)
+* [04 - A generic template to use MODEHB for multi-objective Hyperparameter Optimization](examples/04_mo_pytorch_mnist_hpo.py)
 
 To run PyTorch example: (*note additional requirements*)
 ```bash
@@ -85,7 +86,8 @@ python examples/03_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \
   --verbose --runtime 60 --scheduler_file dask_dump/scheduler.json
 ```
 
-
+### Running DEHB to optimize multiple objectives
+To run multi-objective optimization, one extra parameter is required: `mo_strategy`. Two strategies are provided: non-dominated sorting (NDS) with crowding distance (NSGA-II) and NDS with an eps-net (EPSNET). See the example [04_mo_pytorch_mnist_hpo.py](examples/04_mo_pytorch_mnist_hpo.py) to get started.
 
 ### DEHB Hyperparameters
 
@@ -126,3 +128,10 @@ represents the *mutation* strategy while `bin` represents the *binomial crossove
 editor = {Z. Zhou},
 year = {2021}
 }
+
+@online{Awad-arXiv-2023,
+title = {MO-DEHB: Evolutionary-based Hyperband for Multi-Objective Optimization},
+author = {Noor Awad and Ayushi Sharma and Frank Hutter},
+year = {2023},
+keywords = {}
+}
diff --git a/examples/04_mo_pytorch_mnist_hpo.py b/examples/04_mo_pytorch_mnist_hpo.py
new file mode 100644
index 0000000..0b06795
--- /dev/null
+++ b/examples/04_mo_pytorch_mnist_hpo.py
@@ -0,0 +1,322 @@
+"""
+This script runs a Multi-Objective Hyperparameter Optimisation using MODEHB to tune the architecture and
+training hyperparameters for training a neural network on MNIST in PyTorch.
+It minimizes two objectives: validation loss and model size (log of the number of parameters).
+This example extends the single-objective problem '03_pytorch_mnist_hpo.py' to the multi-objective setting.
+Additional requirements:
+* torch>=1.7.1
+* torchvision>=0.8.2
+* torchsummary>=1.5.1
+
+PyTorch code referenced from: https://github.com/pytorch/examples/blob/master/mnist/main.py
+"""
+
+import argparse
+import os
+import pickle
+import time
+
+import ConfigSpace as CS
+import ConfigSpace.hyperparameters as CSH
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torchvision
+from distributed import Client
+from torchsummary import summary
+from torchvision import transforms
+
+from dehb import MODEHB
+
+
+class Model(nn.Module):
+    def __init__(self, config, img_dim=28, output_dim=10):
+        super().__init__()
+        self.output_dim = output_dim
+        self.pool_kernel = 2
+        self.pool_stride = 1
+        self.maxpool = nn.MaxPool2d(self.pool_kernel, self.pool_stride)
+        self.conv1 = nn.Conv2d(
+            in_channels=1,
+            out_channels=config["channels_1"],
+            kernel_size=config["kernel_1"],
+            stride=config["stride_1"],
+            padding=0,
+            dilation=1
+        )
+        # updating image size after conv1
+        img_dim = self._update_size(img_dim, config["kernel_1"], config["stride_1"], 0, 1)
+        self.conv2 = nn.Conv2d(
+            in_channels=config["channels_1"],
+            out_channels=config["channels_2"],
+            kernel_size=config["kernel_2"],
+            stride=config["stride_2"],
+            padding=0,
+            dilation=1
+        )
+        # updating image size after conv2
+        img_dim = self._update_size(img_dim, config["kernel_2"], config["stride_2"], 0, 1)
+        # updating image size after maxpool
+        img_dim = self._update_size(img_dim, self.pool_kernel, self.pool_stride, 0, 1)
+        self.dropout = nn.Dropout(config["dropout"])
+        hidden_dim = config["hidden"]
+        self.fc1 = nn.Linear(img_dim * img_dim * config["channels_2"], hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim, self.output_dim)
+
+    def forward(self, x):
+        # Layer 1
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.dropout(x)
+        # Layer 2
+        x = self.conv2(x)
+        x = F.relu(x)
+        x = self.maxpool(x)
+        x = self.dropout(x)
+        # FC Layer 1
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        # Output layer
+        x = self.fc2(x)
+        output = F.log_softmax(x, dim=1)
+        return output
+
+    def _update_size(self, dim, kernel_size, stride, padding, dilation):
+        return int(np.floor((dim + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1))
+
+
+def get_configspace(seed=None):
+    cs = CS.ConfigurationSpace(seed)
+
+    # Hyperparameter defining first Conv layer
+    kernel1 = CSH.OrdinalHyperparameter("kernel_1", sequence=[3, 5, 7], default_value=5)
+    channels1 = CSH.UniformIntegerHyperparameter("channels_1", lower=3, upper=64,
+                                                 default_value=32)
+    stride1 = CSH.UniformIntegerHyperparameter("stride_1", lower=1, upper=2, default_value=1)
+    cs.add_hyperparameters([kernel1, channels1, stride1])
+
+    # Hyperparameter defining second Conv layer
+    kernel2 = CSH.OrdinalHyperparameter("kernel_2", sequence=[3, 5, 7], default_value=5)
+    channels2 = CSH.UniformIntegerHyperparameter("channels_2", lower=3, upper=64,
+                                                 default_value=32)
+    stride2 = CSH.UniformIntegerHyperparameter("stride_2", lower=1, upper=2, default_value=1)
+    cs.add_hyperparameters([kernel2, channels2, stride2])
+
+    # Hyperparameter for FC layer
+    hidden = CSH.UniformIntegerHyperparameter(
+        "hidden", lower=32, upper=256, log=True, default_value=128
+    )
+    cs.add_hyperparameter(hidden)
+
+    # Regularization Hyperparameter
+    dropout = CSH.UniformFloatHyperparameter("dropout", lower=0, upper=0.5,
+                                             default_value=0.1)
+    cs.add_hyperparameter(dropout)
+
+    # Training Hyperparameters
+    batch_size = CSH.OrdinalHyperparameter(
+        "batch_size", sequence=[2, 4, 8, 16, 32, 64], default_value=4
+    )
+    lr = CSH.UniformFloatHyperparameter("lr", lower=1e-6, upper=0.1, log=True,
+                                        default_value=1e-3)
+    cs.add_hyperparameters([batch_size, lr])
+    return cs
+
+
+def train(model, device, train_loader, optimizer):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+
+
+def evaluate(model, device, data_loader, acc=False):
+    model.eval()
+    loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in data_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    loss /= len(data_loader.dataset)
+    correct /= len(data_loader.dataset)
+
+    if acc:
+        return correct
+    return loss
+
+
+def train_and_evaluate(config, max_budget, verbose=False, **kwargs):
+    device = kwargs["device"]
+    batch_size = config["batch_size"]
+    train_set = kwargs["train_set"]
+    test_set = kwargs["test_set"]
+    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
+    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
+    model = Model(config).to(device)
+    optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])
+    for epoch in range(1, int(max_budget) + 1):
+        train(model, device, train_loader, optimizer)
+    accuracy = evaluate(model, device, test_loader, acc=True)
+    # log of the total parameter count, used as the model-size objective
+    num_params = np.log(sum(p.numel() for p in model.parameters()))
+    if verbose:
+        summary(model, (1, 28, 28))  # image dimensions for MNIST
+    return [accuracy, num_params]
+
+
+def objective_function(config, budget, **kwargs):
+    """The target function to minimize for HPO."""
+    device = kwargs["device"]
+
+    # Data Loaders
+    batch_size = config["batch_size"]
+    train_set = kwargs["train_set"]
+    valid_set = kwargs["valid_set"]
+    test_set = kwargs["test_set"]
+    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
+    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
+    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
+
+    # Build model
+    model = Model(config).to(device)
+
+    # Optimizer
+    optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])
+
+    start = time.time()  # measuring wallclock time
+    for epoch in range(1, int(budget) + 1):
+        train(model, device, train_loader, optimizer)
+    loss = evaluate(model, device, valid_loader)
+    cost = time.time() - start
+
+    # not including test score computation in the `cost`
+    test_loss = evaluate(model, device, test_loader)
+
+    # get number of model parameters (log scale) as the second objective
+    num_params = np.log(sum(p.numel() for p in model.parameters()))
+
+    # dict representation that DEHB requires
+    res = {
+        "fitness": [loss, num_params],
+        "cost": cost,
+        "info": {"test_loss": test_loss, "budget": budget}
+    }
+    return res
+
+
+def input_arguments():
+    parser = argparse.ArgumentParser(description='Optimizing MNIST in PyTorch using DEHB.')
+    parser.add_argument('--no_cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--seed', type=int, default=123, metavar='S',
+                        help='random seed (default: 123)')
+    parser.add_argument('--refit_training', action='store_true', default=False,
+                        help='Refit with incumbent configuration on full training data and budget')
+    parser.add_argument('--min_budget', type=float, default=1,
+                        help='Minimum budget (epoch length)')
+    parser.add_argument('--max_budget', type=float, default=25,
+                        help='Maximum budget (epoch length)')
+    parser.add_argument('--eta', type=int, default=3,
+                        help='Parameter for Hyperband controlling early stopping aggressiveness')
+    parser.add_argument('--output_path', type=str, default="./pytorch_mnist_dehb",
+                        help='Directory for DEHB to write logs and outputs')
+    parser.add_argument('--scheduler_file', type=str, default=None,
+                        help='The file to connect a Dask client with a Dask scheduler')
+    parser.add_argument('--n_workers', type=int, default=1,
+                        help='Number of CPU workers for DEHB to distribute function evaluations to')
+    parser.add_argument('--single_node_with_gpus', default=False, action="store_true",
+                        help='If True, signals the DEHB run to assume all required GPUs are on '
+                             'the same node/machine. To be specified as True if no client is '
+                             'passed and n_workers > 1. Should be set to False if a client is '
+                             'created from a scheduler file. The onus of GPU usage is then '
+                             'on the Dask workers created and mapped to the scheduler file.')
+    mo_strategy_choices = ['EPSNET', 'NSGA-II']
+    parser.add_argument('--mo_strategy', default="EPSNET", choices=mo_strategy_choices,
+                        type=str, nargs='?',
+                        help="specify the multi-objective strategy from among {}".format(mo_strategy_choices))
+    parser.add_argument('--verbose', action="store_true", default=False,
+                        help='Decides verbosity of DEHB optimization')
+    parser.add_argument('--runtime', type=float, default=300,
+                        help='Total time in seconds as budget to run DEHB')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = input_arguments()
+
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    torch.manual_seed(args.seed)
+
+    # Data Preparation
+    transform = transforms.Compose([
+        transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    train_set = torchvision.datasets.MNIST(
+        root='./data', train=True, download=True, transform=transform
+    )
+    train_set, valid_set = torch.utils.data.random_split(train_set, [50000, 10000])
+    test_set = torchvision.datasets.MNIST(
+        root='./data', train=False, download=True, transform=transform
+    )
+
+    # Get configuration space
+    cs = get_configspace(args.seed)
+    dimensions = len(cs.get_hyperparameters())
+
+    # Some insights into Dask interfaces to DEHB and handling GPU devices for parallelism:
+    # * if args.scheduler_file is specified, args.n_workers need not be specified --- since
+    #   args.scheduler_file indicates a Dask client/server is active
+    # * if args.scheduler_file is not specified and args.n_workers > 1 --- the DEHB object
+    #   creates a Dask client at instantiation and dies with the associated DEHB object
+    # * if args.single_node_with_gpus is True --- assumes that all GPU devices indicated
+    #   through the environment variable "CUDA_VISIBLE_DEVICES" reside on the same machine
+
+    # Dask checks and setups
+    single_node_with_gpus = args.single_node_with_gpus
+    if args.scheduler_file is not None and os.path.isfile(args.scheduler_file):
+        client = Client(scheduler_file=args.scheduler_file)
+        # explicitly delegating GPU handling to the Dask workers defined by the scheduler file
+        single_node_with_gpus = False
+    else:
+        client = None
+
+    ###########################
+    # DEHB optimisation block #
+    ###########################
+    np.random.seed(args.seed)
+    modehb = MODEHB(objective_function=objective_function, cs=cs, dimensions=dimensions,
+                    min_budget=args.min_budget, max_budget=args.max_budget, eta=args.eta,
+                    output_path=args.output_path,
+                    num_objectives=2, mo_strategy=args.mo_strategy,
+                    # if client is not None and of type Client, n_workers is ignored
+                    # if client is None, a Dask client with n_workers is set up
+                    client=client, n_workers=args.n_workers)
+    runtime, history = modehb.run(total_cost=args.runtime, verbose=args.verbose,
+                                  # arguments below are part of **kwargs shared across workers
+                                  train_set=train_set, valid_set=valid_set, test_set=test_set,
+                                  single_node_with_gpus=single_node_with_gpus, device=device)
+    # end of DEHB optimisation
+
+    # Saving optimisation trace history
+    name = time.strftime("%x %X %Z", time.localtime(modehb.start))
+    name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
+    modehb.logger.info("Saving optimisation trace history...")
+    with open(os.path.join(args.output_path, "history_{}.pkl".format(name)), "wb") as f:
+        pickle.dump(history, f)
+    modehb.logger.info("pareto population:{}", modehb.pareto_pop)
+    modehb.logger.info("pareto fitness:{}", modehb.pareto_fit)
+    modehb.logger.debug("runtime:{}", runtime)
+
+
+if __name__ == "__main__":
+    main()
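
For quick reference, the new example above boils down to the generic pattern below. This is a minimal sketch based only on the MODEHB API exercised in this diff (the constructor arguments, the `run` call, and the `pareto_pop`/`pareto_fit` attributes); the toy objective and its hyperparameter `x` are illustrative placeholders, and exact signatures and defaults should be checked against the installed DEHB version.

```python
"""Minimal MODEHB usage sketch distilled from examples/04_mo_pytorch_mnist_hpo.py."""
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

from dehb import MODEHB


def objective_function(config, budget, **kwargs):
    # Train/evaluate a model for `budget` units here; both objectives are minimized.
    loss = (config["x"] - 0.5) ** 2     # toy stand-in for validation loss
    model_size = config["x"] * budget   # toy stand-in for model size
    return {
        "fitness": [loss, model_size],  # list of objectives, length == num_objectives
        "cost": budget,                 # (surrogate) cost of this evaluation
        "info": {"budget": budget},
    }


cs = CS.ConfigurationSpace(seed=123)
cs.add_hyperparameter(CSH.UniformFloatHyperparameter("x", lower=0, upper=1))

modehb = MODEHB(
    objective_function=objective_function,
    cs=cs,
    dimensions=len(cs.get_hyperparameters()),
    min_budget=1, max_budget=9, eta=3,
    output_path="./modehb_example",
    num_objectives=2,
    mo_strategy="EPSNET",  # or "NSGA-II"
    n_workers=1,
)
runtime, history = modehb.run(total_cost=30, verbose=True)
print(modehb.pareto_pop)  # Pareto-optimal configurations
print(modehb.pareto_fit)  # their objective values
```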