Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT REVIEW AS A PR] little example of how we might use nemo-run for handling SLURM #536

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions slurm-run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from dataclasses import dataclass
import os
import nemo_run as run
from typing import Optional, Type

import argparse

import yaml

from bionemo.geneformer.run.config_models import (
ExposedFineTuneSeqLenBioBertConfig,
ExposedGeneformerPretrainConfig,
GeneformerPretrainingDataConfig,
)
from bionemo.geneformer.run.main import args_to_args_dict, defer_load
from bionemo.geneformer.run.nemo_run import build_nrargs, NRArgs

from bionemo.llm.run.config_models import MainConfig
from bionemo.llm.train import NsysConfig, train

def slurm_executor(
    user: str,
    host: str,
    remote_job_dir: str,
    account: str,
    partition: str,
    nodes: int,
    devices: int,
    identity: str,
    time: str = "01:00:00",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    retries: int = 0,
) -> "run.SlurmExecutor":
    """Build a ``run.SlurmExecutor`` that reaches the cluster through an SSH tunnel.

    Args:
        user: Login name used for the SSH tunnel.
        host: Host the SSH tunnel connects to.
        remote_job_dir: Directory on the remote side passed to the tunnel as ``job_dir``.
        account: SLURM account forwarded to the executor.
        partition: SLURM partition forwarded to the executor.
        nodes: Number of nodes to request.
        devices: Devices per node; used for ``ntasks_per_node``, ``gpus_per_node`` and ``gres``.
        identity: Path to the private key handed to the SSH tunnel.
        time: Value assigned to ``executor.time``.
        custom_mounts: Extra container mounts; the list is copied, never mutated.
        custom_env_vars: Extra environment variables; they override the built-in defaults on key clashes.
        container_image: Container image assigned to the executor.
        retries: Value assigned to ``executor.retries``.

    Returns:
        The configured ``run.SlurmExecutor``.

    Raises:
        RuntimeError: If any of user, host, remote_job_dir, account, partition, nodes or devices is falsy.
    """
    if not (user and host and remote_job_dir and account and partition and nodes and devices):
        raise RuntimeError(
            "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function."
        )

    # Custom mounts are defined here. Copy so the caller's list is not aliased by the executor.
    mounts = list(custom_mounts) if custom_mounts else []

    # Env vars for jobs are configured here; caller-supplied values win over these defaults.
    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NCCL_NVLS_ENABLE": "0",
        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
        "NVTE_ASYNC_AMAX_REDUCTION": "1",
        "NVTE_FUSED_ATTN": "0",
    }
    if custom_env_vars:
        env_vars |= custom_env_vars

    # This defines the slurm executor.
    # We connect to the executor via the tunnel defined by user, host and remote_job_dir.
    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.SSHTunnel(
            user=user,
            host=host,
            job_dir=remote_job_dir,  # This is where the results of the run will be stored by default.
            identity=identity,
        ),
        nodes=nodes,
        ntasks_per_node=devices,
        gpus_per_node=devices,
        mem="0",
        exclusive=True,
        # Keep the generic-resource request in step with `devices` instead of a fixed "gpu:8",
        # which contradicted gpus_per_node whenever devices != 8.
        gres=f"gpu:{devices}",
    )

    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time

    return executor

def main():
    """Parse CLI arguments, wrap the training entry point in a recipe and submit it with nemo-run.

    The recipe is a ``Partial`` over ``defer_load`` whose single argument is built from the
    parsed arguments. A SLURM executor is also constructed as an example, but the job is
    submitted with the local executor.
    """
    from nemo_run import Partial
    from bionemo.geneformer.run.argument_parser import parse_args

    cli_args = parse_args()
    recipe = Partial(defer_load, build_nrargs(args_to_args_dict(cli_args)))

    # or use a simple executor
    local_executor = run.LocalExecutor()

    # NOTE: slurm stuff below.
    # OPTIONAL: Provide path to the private key that can be used to establish the SSH connection
    # without entering your password.
    ssh_key_path = "/home/bionemo/.ssh/id_ed25519"
    login_host = "cs-oci-ord-login-03"

    # NOTE, how we mount determines the data and results path we would like to push in.
    # SRC:
    #   /lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data/cellxgene_2023-12-15/processed_data
    # /lustre:/lustre is the easiest mount
    container_mounts = [
        "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/results/bionemo2_geneformer_pretraining/bionemo2_geneformer_pretraining:/results",
        "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data:/workspace/data",
        "/lustre:/lustre",
    ]

    # TODO how do we get nodes and devices out of our config?
    # NOTE(review): this SLURM executor is built but never handed to run.run() below, which
    # submits with the local executor. Confirm whether that is intentional.
    _slurm_executor = slurm_executor(
        user="skothenhill",
        identity=ssh_key_path,
        host=login_host,
        remote_job_dir="/home/skothenhill/20240924-bionemo2/nemorun",
        account="healthcareeng_bionemo",
        partition="polar",
        nodes=1,
        devices=8,
        custom_mounts=container_mounts,
        container_image="nvcr.io/nvidia/clara/bionemo-framework:nightly",
        custom_env_vars={"WANDB_API_KEY": os.environ.get("WANDB_API_KEY", "")},
    )

    # Submit a partial object
    # There is a way to do this with explicit experiment management but idk how.
    run.run(recipe, executor=local_executor, detach=True, dryrun=False)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import argparse
from dataclasses import dataclass, field
from typing import Any, List, Optional, Type

from bionemo.geneformer.run.config_models import ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig

def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options for a Geneformer pretraining run.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse reads
            ``sys.argv[1:]``, which matches the previous zero-argument behaviour.

    Returns:
        Namespace with the attributes ``config``, ``model_config_cls``, ``data_config_cls``,
        ``resume_if_exists``, ``nsys_profiling``, ``nsys_start_step``, ``nsys_end_step``
        (``None`` when the flag is omitted) and ``nsys_ranks``.
    """
    parser = argparse.ArgumentParser(description="Run Geneformer pretraining")
    # The config file is read with yaml.safe_load downstream, so describe it as YAML.
    parser.add_argument("--config", type=str, required=True, help="Path to the YAML (or JSON) configuration file")
    parser.add_argument(
        "--model-config-cls",
        default=ExposedGeneformerPretrainConfig,
        required=False,
        help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
    )
    parser.add_argument(
        "--data-config-cls",
        default=GeneformerPretrainingDataConfig,
        required=False,
        help="fully resolvable python import path to the class.",
    )
    parser.add_argument(
        "--resume-if-exists",
        default=False,
        action="store_true",
        help="Resume training if a checkpoint exists that matches the current experiment configuration.",
    )

    # Debug options.
    parser.add_argument(
        "--nsys-profiling",
        action="store_true",
        default=False,
        help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: "
        " `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop [regular python command here]`",
    )
    # start, end, rank
    parser.add_argument(
        "--nsys-start-step",
        type=int,
        required=False,
        default=0,
        help="Start nsys profiling after this step.",
    )
    parser.add_argument(
        "--nsys-end-step",
        type=int,
        required=False,
        help="End nsys profiling after this step.",
    )
    # rank as list of integers
    parser.add_argument(
        "--nsys-ranks",
        type=int,
        nargs="+",
        required=False,
        default=[0],
        help="Enable nsys profiling for these ranks.",
    )

    return parser.parse_args(argv)

@dataclass
class NRArgs:
    """Typed container for the arguments of a Geneformer run.

    The field names match the argparse destinations defined by ``parse_args``, except that
    the ``config`` file path is replaced by ``config_dict``.
    """

    # Configuration contents as a dictionary.
    config_dict: dict
    # Model / data config classes.
    # NOTE(review): values coming from the CLI may be import-path strings — confirm with callers.
    model_config_cls: Type
    data_config_cls: Type
    resume_if_exists: bool
    nsys_profiling: bool
    nsys_start_step: int
    # `--nsys-end-step` has no default, so argparse yields None when the flag is omitted.
    nsys_end_step: Optional[int]
    # default_factory gives every instance its own list.
    nsys_ranks: list[int] = field(default_factory=lambda: [0])
139 changes: 52 additions & 87 deletions sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@


import argparse
from typing import Optional

from dataclasses import dataclass, field
from typing import Optional, Type
from nemo_run import Config, autoconvert
import yaml

from bionemo.geneformer.run.config_models import (
Expand All @@ -26,94 +27,58 @@
)
from bionemo.llm.run.config_models import MainConfig
from bionemo.llm.train import NsysConfig, train

from bionemo.geneformer.run.argument_parser import parse_args
from bionemo.geneformer.run.nemo_run import NRArgs

def args_to_args_dict(args) -> dict:
    """Convert an argparse namespace into a plain dictionary with the config file inlined.

    The `config` entry, which holds a file path, is replaced by `config_dict`, the parsed
    contents of that file. This allows us to defer parsing until the job is scheduled.

    Arguments:
        args - argparse namespace, acquired from parser.parse_args(). It is not modified.

    Returns:
        Dictionary of arguments with `config` replaced by `config_dict`.

    Raises:
        KeyError: if `args` has no `config` attribute.
        OSError: if the config file cannot be opened.
    """
    # vars() returns the namespace's own __dict__; copy it so that popping `config`
    # does not delete the attribute from the caller's namespace.
    args_dict = dict(vars(args))
    config_path = args_dict.pop("config")
    with open(config_path, "r") as f:
        args_dict["config_dict"] = yaml.safe_load(f)
    return args_dict

def load_config_from_file(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
    """Read the configuration file at `config_path` and build a `MainConfig` from it.

    The file is parsed with `yaml.safe_load`; the resulting dictionary and both class
    selectors are forwarded unchanged to `load_config`.
    """
    with open(config_path, "r") as config_file:
        parsed_config = yaml.safe_load(config_file)
    return load_config(
        parsed_config,
        model_config_cls=model_config_cls,
        data_config_cls=data_config_cls,
    )

def load_config(config_dict: dict, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
    """Build a `MainConfig` from an already-parsed configuration dictionary.

    Arguments:
        config_dict - keyword arguments for the `MainConfig` constructor.
        model_config_cls - selects the model config class. `None` or
            "ExposedGeneformerPretrainConfig" picks the pretraining config,
            "ExposedFineTuneSeqLenBioBertConfig" picks the fine-tune config, and any other
            string is resolved as a fully qualified import path. Non-string values (for
            example a class object) are used as given.
        data_config_cls - selects the data config class. `None` or
            "GeneformerPretrainingDataConfig" picks the pretraining data config, and any
            other string is resolved as a fully qualified import path. Non-string values
            are used as given.

    Returns:
        `MainConfig[model_config_cls, data_config_cls]` constructed from `config_dict`.
    """
    # model/data_config_cls is used to select the parser dynamically.
    if model_config_cls is None or model_config_cls == "ExposedGeneformerPretrainConfig":
        model_config_cls = ExposedGeneformerPretrainConfig
    elif model_config_cls == "ExposedFineTuneSeqLenBioBertConfig":
        # Hardcoded path for those who do not know the full path
        model_config_cls = ExposedFineTuneSeqLenBioBertConfig
    elif isinstance(model_config_cls, str):
        # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig'
        model_config_cls = string_to_class(model_config_cls)

    # Accept the builtin short name here as well, mirroring the model branch. Previously the
    # bare name fell through to string_to_class and failed because it contains no dot.
    if data_config_cls is None or data_config_cls == "GeneformerPretrainingDataConfig":
        data_config_cls = GeneformerPretrainingDataConfig
    elif isinstance(data_config_cls, str):
        data_config_cls = string_to_class(data_config_cls)
    return MainConfig[model_config_cls, data_config_cls](**config_dict)

def string_to_class(path: str):
    """Import and return the attribute named by a fully qualified dotted path.

    Arguments:
        path - dotted path such as 'package.module.ClassName'. Everything before the last
            dot is imported as a module; the remainder is looked up on that module.

    Returns:
        The attribute found on the imported module.

    Raises:
        ValueError: if `path` contains no dot, so no module part can be separated.
        ImportError: if the module part cannot be imported.
        AttributeError: if the module has no attribute with the given name.
    """
    import importlib

    module_path, separator, class_name = path.rpartition(".")
    if not separator:
        # Without this check the failure surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            f"Expected a fully qualified import path such as 'package.module.ClassName', got {path!r}."
        )
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

def main(): # noqa: D103
def parse_args():
parser = argparse.ArgumentParser(description="Run Geneformer pretraining")
parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file")
parser.add_argument(
"--model-config-cls",
default=ExposedGeneformerPretrainConfig,
required=False,
help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
)
parser.add_argument(
"--data-config-cls",
default=GeneformerPretrainingDataConfig,
required=False,
help="fully resolvable python import path to the class.",
)
parser.add_argument(
"--resume-if-exists",
default=False,
action="store_true",
help="Resume training if a checkpoint exists that matches the current experiment configuration.",
)

# Debug options.
parser.add_argument(
"--nsys-profiling",
action="store_true",
default=False,
help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: "
" `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop [regular python command here]`",
)
# start, end, rank
parser.add_argument(
"--nsys-start-step",
type=int,
required=False,
default=0,
help="Start nsys profiling after this step.",
)
parser.add_argument(
"--nsys-end-step",
type=int,
required=False,
help="End nsys profiling after this step.",
)
# rank as list of integers
parser.add_argument(
"--nsys-ranks",
type=int,
nargs="+",
required=False,
default=[0],
help="Enable nsys profiling for these ranks.",
)

return parser.parse_args()

def string_to_class(path: str):
import importlib

module_path, class_name = path.rsplit(".", 1)
module = importlib.import_module(module_path)
return getattr(module, class_name)

def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
with open(config_path, "r") as f:
config_dict = yaml.safe_load(f)

# model/data_config_cls is used to select the parser dynamically.
if model_config_cls is None or model_config_cls == "ExposedGeneformerPretrainConfig":
model_config_cls = ExposedGeneformerPretrainConfig
elif model_config_cls == "ExposedFineTuneSeqLenBioBertConfig":
# Hardcoded path for those who do not know the full path
model_config_cls = ExposedFineTuneSeqLenBioBertConfig
elif isinstance(model_config_cls, str):
# We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig'
model_config_cls = string_to_class(model_config_cls)

if data_config_cls is None:
data_config_cls = GeneformerPretrainingDataConfig
elif isinstance(data_config_cls, str):
data_config_cls = string_to_class(data_config_cls)
return MainConfig[model_config_cls, data_config_cls](**config_dict)

args = parse_args()
config = load_config(args.config, args.model_config_cls, args.data_config_cls)
config = load_config_from_file(args.config, args.model_config_cls, args.data_config_cls)

if args.nsys_profiling:
nsys_config = NsysConfig(
Expand Down
Loading
Loading