Commit
Adding utility functions for checkpoints and the lightning template
TarekAbouChakra committed Oct 19, 2023
1 parent d75430d commit 61103db
Showing 4 changed files with 339 additions and 74 deletions.
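For orientation, the helpers imported from neps.utils.common in these files appear to be called with roughly the following signatures. This is a hypothetical sketch inferred only from the call sites in this diff; the actual definitions in neps.utils.common are not visible in this view and may differ.

# Hypothetical signature sketch, inferred from the call sites in the diffs below.
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import torch


def get_initial_directory(pipeline_directory: Path) -> Path: ...


def load_lightning_checkpoint(
    previous_pipeline_directory: Optional[Path], checkpoint_dir: Path
) -> Tuple[Optional[str], Optional[dict]]: ...


def load_checkpoint(
    previous_pipeline_directory: Optional[Path],
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
) -> Optional[dict]: ...


def save_checkpoint(
    pipeline_directory: Path,
    values_to_save: Dict[str, Any],
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
) -> None: ...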
61 changes: 11 additions & 50 deletions neps_examples/convenience/neps_x_lightning.py
@@ -59,6 +59,7 @@
from torchvision.transforms import transforms

import neps
from neps.utils.common import get_initial_directory, load_lightning_checkpoint

#############################################################
# Defining the seeds for reproducibility
@@ -70,39 +70,6 @@ def set_seed(seed=123):
random.seed(seed)


#############################################################
# Function to get the initial directory used for storing tfevent files and
# checkpoints


def initial_directory(pipeline_directory: Path) -> Path:
"""
Find the initial directory based on its existence and the presence of
the "previous_config.id" file.
Args:
pipeline_directory (Path): The starting directory to search from.
Returns:
Path: The initial directory.
"""
while True:
# Get the id of the previous directory
previous_pipeline_directory_id = pipeline_directory / "previous_config.id"

# Get the directory where all configs are saved
optim_result_dir = pipeline_directory.parent

if previous_pipeline_directory_id.exists():
# Get and join to the previous path according to the id
with open(previous_pipeline_directory_id) as config_id_file:
id = config_id_file.read()
pipeline_directory = optim_result_dir / f"config_{id}"
else:
# Initial directory found
return pipeline_directory


#############################################################
# Define the lightning model

@@ -299,7 +267,7 @@ def search_space() -> dict:

def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# Get the initial directory used to store the event and checkpoint files
init_dir = initial_directory(pipeline_directory)
init_dir = get_initial_directory(pipeline_directory)
checkpoint_dir = init_dir / "checkpoints"

# Initialize the model and checkpoint dir
@@ -316,23 +284,16 @@ def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> d
filename="{epoch}-{val_loss:.2f}",
)

# Initialize variables for checkpoint tracking progress
previously_spent_epochs = 0
checkpoint_path = None

if previous_pipeline_directory:
# Search for possible checkpoints to continue training
ckpt_files = glob.glob(str(checkpoint_dir / "*.ckpt"))
# Use this function to load the previous checkpoint if it exists
checkpoint_path, checkpoint = load_lightning_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
checkpoint_dir=checkpoint_dir,
)

if ckpt_files:
# Load the checkpoint and retrieve necessary data
checkpoint_path = ckpt_files[-1]
checkpoint = torch.load(checkpoint_path)
previously_spent_epochs = checkpoint["epoch"]
else:
raise FileNotFoundError(
"No checkpoint files were located in the checkpoint directory"
)
if checkpoint is None:
previously_spent_epochs = 0
else:
previously_spent_epochs = checkpoint["epoch"]

# Create a PyTorch Lightning Trainer
epochs = config["epochs"]
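The load_lightning_checkpoint helper replaces the glob/torch.load block removed above. A minimal sketch of what it presumably does, assuming it simply wraps that removed logic (the actual implementation in neps.utils.common is not shown here and may differ):

import glob
from pathlib import Path

import torch


def load_lightning_checkpoint_sketch(previous_pipeline_directory, checkpoint_dir):
    # First fidelity step: there is no previous configuration to resume from.
    if previous_pipeline_directory is None:
        return None, None

    # Search for checkpoints written by Lightning's ModelCheckpoint callback.
    ckpt_files = glob.glob(str(Path(checkpoint_dir) / "*.ckpt"))
    if not ckpt_files:
        raise FileNotFoundError(
            "No checkpoint files were located in the checkpoint directory"
        )

    # Load the latest checkpoint and return both its path and its contents.
    checkpoint_path = ckpt_files[-1]
    checkpoint = torch.load(checkpoint_path)
    return checkpoint_path, checkpoint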
154 changes: 154 additions & 0 deletions neps_examples/template/lightning_template.py
@@ -0,0 +1,154 @@
"""
This code is not runnable but should serve as a guide to a successful neps run
using PyTorch Lightning and priorband as the searcher.
Steps:
1. Create search space with a fidelity parameter.
2. Create run_pipeline which includes:
A. Start by getting the initial directory, which will be used to store TensorBoard
event files and checkpoint files.
B. Initialize the lightning model.
C. Create the TensorBoard logger and the checkpoint callback.
D. Check for any existing checkpoint files and load checkpoint data.
E. Create a PyTorch Lightning Trainer.
F. Train the model, calculate metrics, and test the model.
3. Use neps.run and specify "priorband" as the searcher.
For a more detailed guide, please refer to:
https://github.com/automl/neps/blob/master/neps_examples/convenience/neps_x_lightning.py
"""
import logging

import lightning as L
import torch
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

import neps
from neps.utils.common import get_initial_directory, load_lightning_checkpoint

# 1. Create the pipeline_space


def pipeline_space() -> dict:
# Define a dictionary to represent the hyperparameter search space
space = dict(
lr=neps.FloatParameter(lower=1e-5, upper=1e-2, log=True, default=1e-3),
optimizer=neps.CategoricalParameter(choices=["Adam", "SGD"], default="Adam"),
epochs=neps.IntegerParameter(lower=1, upper=9, log=False, is_fidelity=True),
)
return space


# 2. Create the lightning module


class LitModel(L.LightningModule):
def __init__(self, configuration: dict):
super().__init__()

self.save_hyperparameters(configuration)

# You can now define your criterion, transforms, model layers, and
# metrics obtained during training using that configuration
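        # An illustrative example (placeholder names and sizes, not part of the
        # template itself):
        # self.criterion = torch.nn.CrossEntropyLoss()
        # self.layers = torch.nn.Sequential(
        #     torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10)
        # )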

def forward(self, x: torch.Tensor) -> torch.Tensor:
# Forward pass function
pass

def training_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Training step function
# Training metric of choice
pass

def validation_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Validation step function
# Validation metric of choice
pass

def test_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Test step function
# Test metric of choice
pass

def configure_optimizers(self) -> torch.optim.Optimizer:
# Define the optimizer based on the configuration
if self.hparams.optimizer == "Adam":
optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
elif self.hparams.optimizer == "SGD":
optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr)
else:
raise ValueError(f"{self.hparams.optimizer} is not a valid optimizer")
return optimizer

# Here one can now configure the dataloaders for the model
# Further details can be found here:
# https://lightning.ai/docs/pytorch/stable/data/datamodule.html
# https://github.com/automl/neps/blob/master/neps_examples/convenience/neps_x_lightning.py
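    # An illustrative sketch (assumes self.train_dataset / self.val_dataset and a
    # DataLoader import from torch.utils.data, none of which are defined in this
    # template):
    #
    # def train_dataloader(self) -> DataLoader:
    #     return DataLoader(self.train_dataset, batch_size=64, shuffle=True)
    #
    # def val_dataloader(self) -> DataLoader:
    #     return DataLoader(self.val_dataset, batch_size=64, shuffle=False)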


# 3. Define the run pipeline function


def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# A. Start by getting the initial directory which will be used to store tensorboard
# event files and checkpoint files
init_dir = get_initial_directory(pipeline_directory)
checkpoint_dir = init_dir / "checkpoints"
tensorboard_dir = init_dir / "tensorboard"

# B. Create the model
model = LitModel(config)

# C. Create the TensorBoard logger and the checkpoint callback
logger = TensorBoardLogger(
save_dir=tensorboard_dir, name="data", version="logs", default_hp_metric=False
)
checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_dir)

# D. Check for any existing checkpoint files and load the checkpoint data;
# both values are None if no checkpoint files exist.
checkpoint_path, checkpoint_data = load_lightning_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
checkpoint_dir=checkpoint_dir,
)

# E. Create a PyTorch Lightning Trainer
epochs = config["epochs"]

trainer = L.Trainer(
logger=logger,
max_epochs=epochs,
callbacks=[checkpoint_callback],
)

# F. Train and test the model, and collect the corresponding metrics
if checkpoint_path:
trainer.fit(model, ckpt_path=checkpoint_path)
else:
trainer.fit(model)
val_loss = trainer.logged_metrics.get("val_loss", None)

trainer.test(model)
test_loss = trainer.logged_metrics.get("test_loss", None)

return {
"loss": val_loss,
"info_dict": {
"test_loss": test_loss,
},
}


# 4. Define the neps.run function with the searcher as the argument

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)

neps.run(
run_pipeline=run_pipeline,
pipeline_space=pipeline_space(),
root_directory="results",
max_evaluations_total=15,
searcher="priorband",
)
46 changes: 24 additions & 22 deletions neps_examples/template/priorband_template.py
@@ -20,6 +20,7 @@
import torch.nn.functional as F

import neps
from neps.utils.common import load_checkpoint, save_checkpoint


class my_model(nn.Module):
@@ -50,11 +51,9 @@ def pipeline_space() -> dict:
return space


# NOTE: The order of the arguments in the run_pipeline function is important.
def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# 1. Create your checkpoint directory
checkpoint_path = f"{previous_pipeline_directory}/checkpoint"

# 2. Create your model and the optimizer according to the coniguration
# 1. Create your model and the optimizer according to the configuration
model = my_model()

if config["optimizer"] == "Adam":
@@ -70,31 +69,34 @@ def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> d
"Optimizer choices are defined differently in the pipeline_space"
)

# 3. Load the checkpoint states if it exists
if os.path.exists(checkpoint_path):
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
epoch_already_trained = checkpoint["epoch"]
print(f"Read in model trained for {epoch_already_trained} epochs")
# 2. Load the checkpoint from previous_pipeline_directory if it exists
loaded_values = load_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
model=model,
optimizer=optimizer,
)

if loaded_values is not None:
epoch_already_trained = loaded_values["epochs"]
# + Anything else saved in the checkpoint.
else:
epoch_already_trained = 0
# + Anything else with default value.

# 4. Train or continue training the model based on the specified checkpoint
for epoch in range(epoch_already_trained, config["epochs"]):
# 3. Train or continue training the model based on the specified checkpoint
max_epochs = config["epochs"]
for epoch in range(epoch_already_trained, max_epochs):
val_loss = 0

# 5. Save the checkpoint data in the current directory
torch.save(
{
"epoch": config["epochs"],
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
f"{pipeline_directory}/checkpoint",
# 4. Save the checkpoint data in the current directory
save_checkpoint(
pipeline_directory=pipeline_directory,
values_to_save={"epochs": max_epochs},
model=model,
optimizer=optimizer,
)

# 6. Return a dictionary with the results, or a single float value (loss)
# 5. Return a dictionary with the results, or a single float value (loss)
return {
"loss": val_loss,
"info_dict": {
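Judging from the inline torch.save / torch.load code they replace above, the save_checkpoint and load_checkpoint helpers plausibly behave like the sketch below. This is an assumption drawn from the call sites in this diff, not the actual neps.utils.common implementation:

import os

import torch


def save_checkpoint_sketch(pipeline_directory, values_to_save, model, optimizer):
    # Bundle any user-provided values with the model and optimizer states.
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        **values_to_save,
    }
    torch.save(checkpoint, f"{pipeline_directory}/checkpoint")


def load_checkpoint_sketch(previous_pipeline_directory, model, optimizer):
    # First fidelity step: nothing to resume from.
    if previous_pipeline_directory is None:
        return None

    checkpoint_path = f"{previous_pipeline_directory}/checkpoint"
    if not os.path.exists(checkpoint_path):
        return None

    # Restore model and optimizer states in place, then return the raw dict so
    # callers can read back extra values such as "epochs".
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint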