Commit
Adding utility functions for checkpoints and the lightning template
TarekAbouChakra committed Oct 19, 2023
1 parent d75430d commit 61103db
Showing 4 changed files with 339 additions and 74 deletions.
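For orientation, the helpers imported from neps.utils.common in these files appear to be called with roughly the following signatures. This is a hypothetical sketch inferred only from the call sites in this diff; the actual definitions in neps.utils.common are not visible in this view and may differ.

# Hypothetical signature sketch, inferred from the call sites in the diffs below.
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import torch


def get_initial_directory(pipeline_directory: Path) -> Path: ...


def load_lightning_checkpoint(
    previous_pipeline_directory: Optional[Path], checkpoint_dir: Path
) -> Tuple[Optional[str], Optional[dict]]: ...


def load_checkpoint(
    previous_pipeline_directory: Optional[Path],
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
) -> Optional[dict]: ...


def save_checkpoint(
    pipeline_directory: Path,
    values_to_save: Dict[str, Any],
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
) -> None: ...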
61 changes: 11 additions & 50 deletions neps_examples/convenience/neps_x_lightning.py
@@ -59,6 +59,7 @@
from torchvision.transforms import transforms

import neps
from neps.utils.common import get_initial_directory, load_lightning_checkpoint

#############################################################
# Defining the seeds for reproducibility
@@ -70,39 +70,6 @@ def set_seed(seed=123):
random.seed(seed)


#############################################################
# Function to get the initial directory used for storing tfevent files and
# checkpoints


def initial_directory(pipeline_directory: Path) -> Path:
"""
Find the initial directory based on its existence and the presence of
the "previous_config.id" file.
Args:
pipeline_directory (Path): The starting directory to search from.
Returns:
Path: The initial directory.
"""
while True:
# Get the id of the previous directory
previous_pipeline_directory_id = pipeline_directory / "previous_config.id"

# Get the directory where all configs are saved
optim_result_dir = pipeline_directory.parent

if previous_pipeline_directory_id.exists():
# Get and join to the previous path according to the id
with open(previous_pipeline_directory_id) as config_id_file:
id = config_id_file.read()
pipeline_directory = optim_result_dir / f"config_{id}"
else:
# Initial directory found
return pipeline_directory


#############################################################
# Define the lightning model

@@ -299,7 +267,7 @@ def search_space() -> dict:

def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# Get the initial directory used to store the event and checkpoint files
init_dir = initial_directory(pipeline_directory)
init_dir = get_initial_directory(pipeline_directory)
checkpoint_dir = init_dir / "checkpoints"

# Initialize the model and checkpoint dir
@@ -316,23 +284,16 @@ def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> d
filename="{epoch}-{val_loss:.2f}",
)

# Initialize variables for checkpoint tracking progress
previously_spent_epochs = 0
checkpoint_path = None

if previous_pipeline_directory:
# Search for possible checkpoints to continue training
ckpt_files = glob.glob(str(checkpoint_dir / "*.ckpt"))
# Use this function to load the previous checkpoint if it exists
checkpoint_path, checkpoint = load_lightning_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
checkpoint_dir=checkpoint_dir,
)

if ckpt_files:
# Load the checkpoint and retrieve necessary data
checkpoint_path = ckpt_files[-1]
checkpoint = torch.load(checkpoint_path)
previously_spent_epochs = checkpoint["epoch"]
else:
raise FileNotFoundError(
"No checkpoint files were located in the checkpoint directory"
)
if checkpoint is None:
previously_spent_epochs = 0
else:
previously_spent_epochs = checkpoint["epoch"]

# Create a PyTorch Lightning Trainer
epochs = config["epochs"]
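The load_lightning_checkpoint helper replaces the glob/torch.load block removed above. A minimal sketch of what it presumably does, assuming it simply wraps that removed logic (the actual implementation in neps.utils.common is not shown here and may differ):

import glob
from pathlib import Path

import torch


def load_lightning_checkpoint_sketch(previous_pipeline_directory, checkpoint_dir):
    # First fidelity step: there is no previous configuration to resume from.
    if previous_pipeline_directory is None:
        return None, None

    # Search for checkpoints written by Lightning's ModelCheckpoint callback.
    ckpt_files = glob.glob(str(Path(checkpoint_dir) / "*.ckpt"))
    if not ckpt_files:
        raise FileNotFoundError(
            "No checkpoint files were located in the checkpoint directory"
        )

    # Load the latest checkpoint and return both its path and its contents.
    checkpoint_path = ckpt_files[-1]
    checkpoint = torch.load(checkpoint_path)
    return checkpoint_path, checkpoint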
154 changes: 154 additions & 0 deletions neps_examples/template/lightning_template.py
@@ -0,0 +1,154 @@
"""
This code is not runnable but should serve as a guide to a successful neps run
using PyTorch Lightning and priorband as the searcher.
Steps:
1. Create search space with a fidelity parameter.
2. Create run_pipeline which includes:
A. Start by getting the initial directory, which will be used to store TensorBoard
event files and checkpoint files.
B. Initialize the lightning model.
C. Create the TensorBoard logger and the checkpoint callback.
D. Check for any existing checkpoint files and load checkpoint data.
E. Create a PyTorch Lightning Trainer.
F. Train the model, calculate metrics, and test the model.
3. Use neps.run and specify "priorband" as the searcher.
For a more detailed guide, please refer to:
https://github.com/automl/neps/blob/master/neps_examples/convenience/neps_x_lightning.py
"""
import logging

import lightning as L
import torch
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

import neps
from neps.utils.common import get_initial_directory, load_lightning_checkpoint

# 1. Create the pipeline_space


def pipeline_space() -> dict:
# Define a dictionary to represent the hyperparameter search space
space = dict(
lr=neps.FloatParameter(lower=1e-5, upper=1e-2, log=True, default=1e-3),
optimizer=neps.CategoricalParameter(choices=["Adam", "SGD"], default="Adam"),
epochs=neps.IntegerParameter(lower=1, upper=9, log=False, is_fidelity=True),
)
return space


# 2. Create the lightning module


class LitModel(L.LightningModule):
def __init__(self, configuration: dict):
super().__init__()

self.save_hyperparameters(configuration)

# You can now define your criterion, transforms, model layers, and
# metrics obtained during training using that configuration
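        # An illustrative example (placeholder names and sizes, not part of the
        # template itself):
        # self.criterion = torch.nn.CrossEntropyLoss()
        # self.layers = torch.nn.Sequential(
        #     torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10)
        # )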

def forward(self, x: torch.Tensor) -> torch.Tensor:
# Forward pass function
pass

def training_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Training step function
# Training metric of choice
pass

def validation_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Validation step function
# Validation metric of choice
pass

def test_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
# Test step function
# Test metric of choice
pass

def configure_optimizers(self) -> torch.optim.Optimizer:
# Define the optimizer based on the configuration
if self.hparams.optimizer == "Adam":
optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
elif self.hparams.optimizer == "SGD":
optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr)
else:
raise ValueError(f"{self.hparams.optimizer} is not a valid optimizer")
return optimizer

# Here one can now configure the dataloaders for the model
# Further details can be found here:
# https://lightning.ai/docs/pytorch/stable/data/datamodule.html
# https://github.com/automl/neps/blob/master/neps_examples/convenience/neps_x_lightning.py
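    # An illustrative sketch (assumes self.train_dataset / self.val_dataset and a
    # DataLoader import from torch.utils.data, none of which are defined in this
    # template):
    #
    # def train_dataloader(self) -> DataLoader:
    #     return DataLoader(self.train_dataset, batch_size=64, shuffle=True)
    #
    # def val_dataloader(self) -> DataLoader:
    #     return DataLoader(self.val_dataset, batch_size=64, shuffle=False)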


# 3. Define the run pipeline function


def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# A. Start by getting the initial directory which will be used to store tensorboard
# event files and checkpoint files
init_dir = get_initial_directory(pipeline_directory)
checkpoint_dir = init_dir / "checkpoints"
tensorboard_dir = init_dir / "tensorboard"

# B. Create the model
model = LitModel(config)

# C. Create the TensorBoard logger and the checkpoint callback
logger = TensorBoardLogger(
save_dir=tensorboard_dir, name="data", version="logs", default_hp_metric=False
)
checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_dir)

# D. Check for any existing checkpoint files and load the checkpoint data;
# both values are None if no checkpoint files exist.
checkpoint_path, checkpoint_data = load_lightning_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
checkpoint_dir=checkpoint_dir,
)

# E. Create a PyTorch Lightning Trainer
epochs = config["epochs"]

trainer = L.Trainer(
logger=logger,
max_epochs=epochs,
callbacks=[checkpoint_callback],
)

# F. Train and test the model, and collect the corresponding metrics
if checkpoint_path:
trainer.fit(model, ckpt_path=checkpoint_path)
else:
trainer.fit(model)
val_loss = trainer.logged_metrics.get("val_loss", None)

trainer.test(model)
test_loss = trainer.logged_metrics.get("test_loss", None)

return {
"loss": val_loss,
"info_dict": {
"test_loss": test_loss,
},
}


# 4. Define the neps.run function with the searcher as the argument

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)

neps.run(
run_pipeline=run_pipeline,
pipeline_space=pipeline_space(),
root_directory="results",
max_evaluations_total=15,
searcher="priorband",
)
46 changes: 24 additions & 22 deletions neps_examples/template/priorband_template.py
@@ -20,6 +20,7 @@
import torch.nn.functional as F

import neps
from neps.utils.common import load_checkpoint, save_checkpoint


class my_model(nn.Module):
@@ -50,11 +51,9 @@ def pipeline_space() -> dict:
return space


# NOTE: The order of the arguments in the run_pipeline function is important.
def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> dict:
# 1. Create your checkpoint directory
checkpoint_path = f"{previous_pipeline_directory}/checkpoint"

# 2. Create your model and the optimizer according to the coniguration
# 1. Create your model and the optimizer according to the configuration
model = my_model()

if config["optimizer"] == "Adam":
@@ -70,31 +69,34 @@ def run_pipeline(pipeline_directory, previous_pipeline_directory, **config) -> d
"Optimizer choices are defined differently in the pipeline_space"
)

# 3. Load the checkpoint states if it exists
if os.path.exists(checkpoint_path):
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
epoch_already_trained = checkpoint["epoch"]
print(f"Read in model trained for {epoch_already_trained} epochs")
# 2. Load the checkpoint from previous_pipeline_directory if it exists
loaded_values = load_checkpoint(
previous_pipeline_directory=previous_pipeline_directory,
model=model,
optimizer=optimizer,
)

if loaded_values is not None:
epoch_already_trained = loaded_values["epochs"]
# + Anything else saved in the checkpoint.
else:
epoch_already_trained = 0
# + Anything else with default value.

# 4. Train or continue training the model based on the specified checkpoint
for epoch in range(epoch_already_trained, config["epochs"]):
# 3. Train or continue training the model based on the specified checkpoint
max_epochs = config["epochs"]
for epoch in range(epoch_already_trained, max_epochs):
val_loss = 0

# 5. Save the checkpoint data in the current directory
torch.save(
{
"epoch": config["epochs"],
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
f"{pipeline_directory}/checkpoint",
# 4. Save the checkpoint data in the current directory
save_checkpoint(
pipeline_directory=pipeline_directory,
values_to_save={"epochs": max_epochs},
model=model,
optimizer=optimizer,
)

# 6. Return a dictionary with the results, or a single float value (loss)
# 5. Return a dictionary with the results, or a single float value (loss)
return {
"loss": val_loss,
"info_dict": {
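Judging from the inline torch.save / torch.load code they replace above, the save_checkpoint and load_checkpoint helpers plausibly behave like the sketch below. This is an assumption drawn from the call sites in this diff, not the actual neps.utils.common implementation:

import os

import torch


def save_checkpoint_sketch(pipeline_directory, values_to_save, model, optimizer):
    # Bundle any user-provided values with the model and optimizer states.
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        **values_to_save,
    }
    torch.save(checkpoint, f"{pipeline_directory}/checkpoint")


def load_checkpoint_sketch(previous_pipeline_directory, model, optimizer):
    # First fidelity step: nothing to resume from.
    if previous_pipeline_directory is None:
        return None

    checkpoint_path = f"{previous_pipeline_directory}/checkpoint"
    if not os.path.exists(checkpoint_path):
        return None

    # Restore model and optimizer states in place, then return the raw dict so
    # callers can read back extra values such as "epochs".
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    return checkpoint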