NVIDIA · skothenhill-nv · Dec 10, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/slurm-run.py b/slurm-run.py
@@ -0,0 +1,148 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+import os
+import nemo_run as run
+from typing import Optional, Type
+
+import argparse
+
+import yaml
+
+from bionemo.geneformer.run.config_models import (
+    ExposedFineTuneSeqLenBioBertConfig,
+    ExposedGeneformerPretrainConfig,
+    GeneformerPretrainingDataConfig,
+)
+from bionemo.geneformer.run.main import args_to_args_dict, defer_load
+from bionemo.geneformer.run.nemo_run import build_nrargs, NRArgs
+
+from bionemo.llm.run.config_models import MainConfig
+from bionemo.llm.train import NsysConfig, train
+
+def slurm_executor(
+    user: str,
+    host: str,
+    remote_job_dir: str,
+    account: str,
+    partition: str,
+    nodes: int,
+    devices: int,
+    identity: str,
+    time: str = "01:00:00",
+    custom_mounts: Optional[list[str]] = None,
+    custom_env_vars: Optional[dict[str, str]] = None,
+    container_image: str = "nvcr.io/nvidia/nemo:dev",
+    retries: int = 0,
+) -> run.SlurmExecutor:
+    """Build a `run.SlurmExecutor` that is reached through an SSH tunnel.
+
+    Args:
+        user: User name for the SSH tunnel.
+        host: Host name for the SSH tunnel.
+        remote_job_dir: Job directory of the tunnel; run results are stored here by default.
+        account: Slurm account.
+        partition: Slurm partition.
+        nodes: Number of nodes.
+        devices: Used for both `ntasks_per_node` and `gpus_per_node`.
+        identity: Forwarded to the SSH tunnel as `identity`.
+        time: Assigned to `executor.time`.
+        custom_mounts: Container mounts; none are added when this is empty or None.
+        custom_env_vars: Extra environment variables; they win over the defaults on key collisions.
+        container_image: Assigned to `executor.container_image`.
+        retries: Assigned to `executor.retries`.
+
+    Returns:
+        The configured executor.
+
+    Raises:
+        RuntimeError: If any of user, host, remote_job_dir, account, partition, nodes or devices is falsy.
+    """
+    required = (user, host, remote_job_dir, account, partition, nodes, devices)
+    if not all(required):
+        raise RuntimeError(
+            "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function."
+        )
+
+    # The only mounts are the ones supplied by the caller.
+    container_mounts = list(custom_mounts) if custom_mounts else []
+
+    # Default job environment, overlaid with any caller-supplied variables.
+    job_env = {
+        "TRANSFORMERS_OFFLINE": "1",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
+        "NCCL_NVLS_ENABLE": "0",
+        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
+        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "NVTE_FUSED_ATTN": "0",
+    }
+    if custom_env_vars:
+        job_env.update(custom_env_vars)
+
+    # The executor is reached through the tunnel described by user, host and remote_job_dir.
+    ssh_tunnel = run.SSHTunnel(user=user, host=host, job_dir=remote_job_dir, identity=identity)
+    executor = run.SlurmExecutor(
+        account=account,
+        partition=partition,
+        tunnel=ssh_tunnel,
+        nodes=nodes,
+        ntasks_per_node=devices,
+        gpus_per_node=devices,
+        mem="0",
+        exclusive=True,
+        gres="gpu:8",  # NOTE(review): fixed at 8 regardless of `devices` - confirm this is intended.
+    )
+
+    # The remaining settings are applied as attributes after construction.
+    executor.container_image = container_image
+    executor.container_mounts = container_mounts
+    executor.env_vars = job_env
+    executor.retries = retries
+    executor.time = time
+
+    return executor
+
+def main():
+    """Parse the command line, wrap `defer_load` in a nemo_run `Partial`, and submit it.
+
+    A Slurm executor is also constructed from hard-coded cluster settings, but the submission at the
+    end of this function uses the local executor.
+    """
+    from nemo_run import Partial
+    from bionemo.geneformer.run.argument_parser import parse_args
+
+    cli_args = parse_args()
+    recipe = Partial(defer_load, build_nrargs(args_to_args_dict(cli_args)))
+
+    # Simple executor; this is the one handed to run.run below.
+    executor = run.LocalExecutor()
+
+    # NOTE: slurm settings below.
+    # OPTIONAL: path to the private key that can be used to establish the SSH connection without
+    # entering your password.
+    ssh_identity = "/home/bionemo/.ssh/id_ed25519"
+    login_host = "cs-oci-ord-login-03"
+
+    # NOTE: how we mount determines the data and results paths we would like to push in.
+    # SRC:
+    #   /lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data/cellxgene_2023-12-15/processed_data
+    #   /lustre:/lustre is the easiest mount
+    container_mounts = [
+        "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/results/bionemo2_geneformer_pretraining/bionemo2_geneformer_pretraining:/results",
+        "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data:/workspace/data",
+        "/lustre:/lustre",
+    ]
+
+    # TODO how do we get nodes and devices out of our config?
+    # NOTE(review): this executor is built but never passed to run.run - confirm whether that is intended.
+    _executor = slurm_executor(
+        user="skothenhill",
+        host=login_host,
+        identity=ssh_identity,
+        remote_job_dir="/home/skothenhill/20240924-bionemo2/nemorun",
+        account="healthcareeng_bionemo",
+        partition="polar",
+        nodes=1,
+        devices=8,
+        container_image="nvcr.io/nvidia/clara/bionemo-framework:nightly",
+        custom_mounts=container_mounts,
+        custom_env_vars={"WANDB_API_KEY": os.environ.get("WANDB_API_KEY", "")},
+    )
+
+    # Submit the partial object directly; explicit experiment management is not used here.
+    run.run(recipe, executor=executor, detach=True, dryrun=False)
+
+
+# Run the submission only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,72 @@
+from dataclasses import dataclass, field
+from typing import Any, Type, List
+import argparse
+
+from bionemo.geneformer.run.config_models import ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig
+
+def parse_args():
+    """Define the Geneformer pretraining command line and parse the process arguments.
+
+    Returns:
+        The argparse namespace with `config`, `model_config_cls`, `data_config_cls`, `resume_if_exists`,
+        `nsys_profiling`, `nsys_start_step`, `nsys_end_step` (None when not given) and `nsys_ranks`.
+    """
+    cli = argparse.ArgumentParser(description="Run Geneformer pretraining")
+
+    # Configuration file and the classes used to interpret it.
+    cli.add_argument("--config", required=True, type=str, help="Path to the JSON configuration file")
+    cli.add_argument(
+        "--model-config-cls",
+        help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
+        default=ExposedGeneformerPretrainConfig,
+    )
+    cli.add_argument(
+        "--data-config-cls",
+        help="fully resolvable python import path to the class.",
+        default=GeneformerPretrainingDataConfig,
+    )
+    cli.add_argument(
+        "--resume-if-exists",
+        action="store_true",
+        help="Resume training if a checkpoint exists that matches the current experiment configuration.",
+    )
+
+    # Debug options: nsys profiling switch, then its start step, end step and ranks.
+    cli.add_argument(
+        "--nsys-profiling",
+        action="store_true",
+        help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: "
+        " `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop  [regular python command here]`",
+    )
+    cli.add_argument("--nsys-start-step", type=int, default=0, help="Start nsys profiling after this step.")
+    # No default is given for the end step, so it is None unless the flag is passed.
+    cli.add_argument("--nsys-end-step", type=int, help="End nsys profiling after this step.")
+    # One or more integer ranks.
+    cli.add_argument("--nsys-ranks", type=int, nargs="+", default=[0], help="Enable nsys profiling for these ranks.")
+
+    namespace = cli.parse_args()
+    return namespace
+
+@dataclass
+class NRArgs:
+    """Typed container whose fields mirror the options defined in `parse_args`.
+
+    The `config` file path is represented here by `config_dict` instead.
+    NOTE(review): presumably built from the dictionary returned by `args_to_args_dict` - verify
+    against `build_nrargs`.
+    """
+
+    config_dict: dict  # configuration held as a dictionary rather than a file path
+    model_config_cls: Type  # counterpart of --model-config-cls
+    data_config_cls: Type  # counterpart of --data-config-cls
+    resume_if_exists: bool  # counterpart of --resume-if-exists
+    nsys_profiling: bool  # counterpart of --nsys-profiling
+    nsys_start_step: int  # counterpart of --nsys-start-step
+    # NOTE(review): --nsys-end-step has no default, so None may be passed here - confirm.
+    nsys_end_step: int
+    # A factory is used so that every instance gets its own list.
+    nsys_ranks: list[int] = field(default_factory=lambda: [0])
@@ -15,8 +15,9 @@
 
 
 import argparse
-from typing import Optional
-
+from dataclasses import dataclass, field
+from typing import Optional, Type
+from nemo_run import Config, autoconvert
 import yaml
 
 from bionemo.geneformer.run.config_models import (
@@ -26,94 +27,58 @@
 )
 from bionemo.llm.run.config_models import MainConfig
 from bionemo.llm.train import NsysConfig, train
-
+from bionemo.geneformer.run.argument_parser import parse_args
+from bionemo.geneformer.run.nemo_run import NRArgs
+
+def args_to_args_dict(args) -> dict:
+    """Transform the ArgumentParser namespace into a dictionary, replacing `config` with `config_dict`.
+
+    `config`, which holds a file path, is removed and the parsed contents of that file are stored
+    under `config_dict` instead. This allows us to defer parsing until the job is scheduled.
+
+    Arguments:
+        args - argparse namespace arguments, acquired from parser.parse_args(). It is left unmodified.
+
+    Returns:
+        Dictionary of arguments with `config` replaced by `config_dict`.
+
+    Raises:
+        KeyError - if `args` has no `config` attribute.
+        OSError - if the file named by `config` cannot be opened.
+    """
+    # vars() hands back the namespace's own __dict__. Work on a copy so that removing `config` and
+    # adding `config_dict` does not silently rewrite the caller's namespace.
+    args_dict = dict(vars(args))
+    config_path = args_dict.pop("config")
+    with open(config_path, "r") as f:
+        args_dict["config_dict"] = yaml.safe_load(f)
+    return args_dict
+
+def load_config_from_file(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
+    """Read the file at `config_path` with `yaml.safe_load` and build a `MainConfig` via `load_config`.
+
+    Args:
+        config_path: Path of the configuration file.
+        model_config_cls: Forwarded unchanged to `load_config`.
+        data_config_cls: Forwarded unchanged to `load_config`.
+
+    Returns:
+        The `MainConfig` produced by `load_config`.
+    """
+    with open(config_path, "r") as config_file:
+        parsed = yaml.safe_load(config_file)
+
+    main_config = load_config(parsed, model_config_cls=model_config_cls, data_config_cls=data_config_cls)
+    return main_config
+
+def load_config(config_dict: dict, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
+    """Build a `MainConfig`, parameterized by the selected model and data config classes.
+
+    Args:
+        config_dict: Keyword arguments for the `MainConfig` constructor.
+        model_config_cls: None or "ExposedGeneformerPretrainConfig" selects ExposedGeneformerPretrainConfig,
+            "ExposedFineTuneSeqLenBioBertConfig" selects ExposedFineTuneSeqLenBioBertConfig, any other
+            string is imported with `string_to_class`, and a non-string value is used as given.
+        data_config_cls: None selects GeneformerPretrainingDataConfig, a string is imported with
+            `string_to_class`, and a non-string value is used as given.
+
+    Returns:
+        The `MainConfig` instance.
+    """
+    # Short names accepted for those who do not know the full import path.
+    builtin_model_configs = {
+        "ExposedGeneformerPretrainConfig": ExposedGeneformerPretrainConfig,
+        "ExposedFineTuneSeqLenBioBertConfig": ExposedFineTuneSeqLenBioBertConfig,
+    }
+
+    # model/data_config_cls select the parser dynamically.
+    if model_config_cls is None:
+        model_cls = ExposedGeneformerPretrainConfig
+    elif isinstance(model_config_cls, str):
+        if model_config_cls in builtin_model_configs:
+            model_cls = builtin_model_configs[model_config_cls]
+        else:
+            # Any other string is taken to be an importable path, e.g. 'bionemo.jensen.configs.MyConfig'.
+            model_cls = string_to_class(model_config_cls)
+    else:
+        model_cls = model_config_cls
+
+    if data_config_cls is None:
+        data_cls = GeneformerPretrainingDataConfig
+    elif isinstance(data_config_cls, str):
+        data_cls = string_to_class(data_config_cls)
+    else:
+        data_cls = data_config_cls
+
+    return MainConfig[model_cls, data_cls](**config_dict)
+
+def string_to_class(path: str):
+    """Import and return the attribute named by a dotted path such as "package.module.ClassName".
+
+    The text before the last "." is imported as a module; the text after it is looked up on that module.
+    """
+    from importlib import import_module
+
+    module_name, attribute_name = path.rsplit(".", 1)
+    return getattr(import_module(module_name), attribute_name)
 
 def main():  # noqa: D103
-    def parse_args():
-        parser = argparse.ArgumentParser(description="Run Geneformer pretraining")
-        parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file")
-        parser.add_argument(
-            "--model-config-cls",
-            default=ExposedGeneformerPretrainConfig,
-            required=False,
-            help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
-        )
-        parser.add_argument(
-            "--data-config-cls",
-            default=GeneformerPretrainingDataConfig,
-            required=False,
-            help="fully resolvable python import path to the class.",
-        )
-        parser.add_argument(
-            "--resume-if-exists",
-            default=False,
-            action="store_true",
-            help="Resume training if a checkpoint exists that matches the current experiment configuration.",
-        )
-
-        # Debug options.
-        parser.add_argument(
-            "--nsys-profiling",
-            action="store_true",
-            default=False,
-            help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: "
-            " `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop  [regular python command here]`",
-        )
-        # start, end, rank
-        parser.add_argument(
-            "--nsys-start-step",
-            type=int,
-            required=False,
-            default=0,
-            help="Start nsys profiling after this step.",
-        )
-        parser.add_argument(
-            "--nsys-end-step",
-            type=int,
-            required=False,
-            help="End nsys profiling after this step.",
-        )
-        # rank as list of integers
-        parser.add_argument(
-            "--nsys-ranks",
-            type=int,
-            nargs="+",
-            required=False,
-            default=[0],
-            help="Enable nsys profiling for these ranks.",
-        )
-
-        return parser.parse_args()
-
-    def string_to_class(path: str):
-        import importlib
-
-        module_path, class_name = path.rsplit(".", 1)
-        module = importlib.import_module(module_path)
-        return getattr(module, class_name)
-
-    def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
-        with open(config_path, "r") as f:
-            config_dict = yaml.safe_load(f)
-
-        # model/data_config_cls is used to select the parser dynamically.
-        if model_config_cls is None or model_config_cls == "ExposedGeneformerPretrainConfig":
-            model_config_cls = ExposedGeneformerPretrainConfig
-        elif model_config_cls == "ExposedFineTuneSeqLenBioBertConfig":
-            # Hardcoded path for those who do not know the full path
-            model_config_cls = ExposedFineTuneSeqLenBioBertConfig
-        elif isinstance(model_config_cls, str):
-            # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig'
-            model_config_cls = string_to_class(model_config_cls)
-
-        if data_config_cls is None:
-            data_config_cls = GeneformerPretrainingDataConfig
-        elif isinstance(data_config_cls, str):
-            data_config_cls = string_to_class(data_config_cls)
-        return MainConfig[model_config_cls, data_config_cls](**config_dict)
-
     args = parse_args()
-    config = load_config(args.config, args.model_config_cls, args.data_config_cls)
+    config = load_config_from_file(args.config, args.model_config_cls, args.data_config_cls)
 
     if args.nsys_profiling:
         nsys_config = NsysConfig(