From d28acd3322f6dfafda7cf7c3d8f7c428100f3b14 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Thu, 12 Mar 2026 12:45:37 -0700 Subject: [PATCH 01/12] add: ModelOpt Launcher for Slurm job submission Add launcher/ module with launch.py that submits quantization, training, and evaluation jobs to Slurm clusters via nemo-run. Produces identical code/ layout as nmm-sandbox's slurm.py so the same YAML configs work in both. Includes Megatron-LM and Model-Optimizer as submodules. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- .gitmodules | 6 + launcher/__init__.py | 16 ++ launcher/launch.py | 461 +++++++++++++++++++++++++++++++ launcher/modules/Megatron-LM | 1 + launcher/modules/Model-Optimizer | 1 + launcher/pyproject.toml | 12 + 6 files changed, 497 insertions(+) create mode 100644 .gitmodules create mode 100644 launcher/__init__.py create mode 100644 launcher/launch.py create mode 160000 launcher/modules/Megatron-LM create mode 160000 launcher/modules/Model-Optimizer create mode 100644 launcher/pyproject.toml diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..23a5af209 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "launcher/modules/Megatron-LM"] + path = launcher/modules/Megatron-LM + url = https://github.com/AAnoosheh/Megatron-LM.git +[submodule "launcher/modules/Model-Optimizer"] + path = launcher/modules/Model-Optimizer + url = https://github.com/NVIDIA/Model-Optimizer.git diff --git a/launcher/__init__.py b/launcher/__init__.py new file mode 100644 index 000000000..11b92d8b7 --- /dev/null +++ b/launcher/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters.""" diff --git a/launcher/launch.py b/launcher/launch.py new file mode 100644 index 000000000..19b462f90 --- /dev/null +++ b/launcher/launch.py @@ -0,0 +1,461 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters. 
+ +Usage: + uv run launch.py task=@configs/quantize/Qwen3-8B.yaml --yes + uv run launch.py pipeline=@configs/pipeline/eagle3.yaml --yes + uv run launch.py task=@configs/quantize/Qwen3-8B.yaml hf_local=/mnt/hf-local --yes + +Environment variables: + SLURM_HOST Slurm login node hostname (required for remote jobs) + SLURM_ACCOUNT Slurm account/partition billing (default: from YAML) + SLURM_JOB_DIR Remote directory for job artifacts + SLURM_HF_LOCAL Path to HuggingFace model cache on the cluster + HF_TOKEN HuggingFace API token + NEMORUN_HOME NeMo Run home directory (default: current working directory) +""" + +import dataclasses +import getpass +import json +import os +import re +import warnings +from dataclasses import dataclass + +import nemo_run as run +import yaml + +# --------------------------------------------------------------------------- +# Slurm configuration +# --------------------------------------------------------------------------- + + +@dataclass +class SlurmConfig: + """Cluster-agnostic Slurm configuration. + + Users define cluster details in their YAML configs or override via CLI. + No internal cluster defaults are embedded here. 
+ """ + + host: str | None = None + port: int = 22 + account: str | None = None + partition: str = "batch" + container: str | None = None + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt" + container_mounts: list[str] | None = None + srun_args: list[str] | None = None + array: str | None = None + nodes: int = 1 + ntasks_per_node: int = 1 + gpus_per_node: int = 1 + local: bool = False + + +@run.cli.factory +@run.autoconvert +def slurm_factory( + host: str = os.environ.get("SLURM_HOST", ""), + account: str = os.environ.get("SLURM_ACCOUNT", ""), + partition: str = "batch", + nodes: int = 1, + ntasks_per_node: int = 1, + gpus_per_node: int = 1, + container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5", + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt", + container_mounts: list[str] | None = None, + srun_args: list[str] | None = None, + array: str | None = None, +) -> SlurmConfig: + """Generic Slurm factory — configure via environment variables or CLI overrides.""" + if container_mounts is None: + hf_local = os.environ.get("SLURM_HF_LOCAL", "/hf-local") + container_mounts = ["{}:/hf-local".format(hf_local)] + if srun_args is None: + srun_args = ["--no-container-mount-home"] + return SlurmConfig( + host=host, + account=account, + partition=partition, + nodes=nodes, + ntasks_per_node=ntasks_per_node, + gpus_per_node=gpus_per_node, + container=container, + modelopt_install_path=modelopt_install_path, + container_mounts=container_mounts, + srun_args=srun_args, + array=array, + ) + + +# --------------------------------------------------------------------------- +# Default environment variables injected into every job +# --------------------------------------------------------------------------- + +DEFAULT_SLURM_ENV = { + "HF_HOME": "/hf-cache", + "HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", + "LAUNCH_SCRIPT": "python", +} + +DEFAULT_LOCAL_ENV = { + "HF_HOME": "/hf-cache", + 
"HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", +} + + +# --------------------------------------------------------------------------- +# Task and pipeline dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class SandboxTask: + """A single task with a script, slurm config, args, and environment.""" + + script: str = None + slurm_config: SlurmConfig = None + args: list[str] = None + environment: list[dict] = None + yaml_file: str = None + + +@dataclass +class SandboxTask0(SandboxTask): + """Task slot 0 in a pipeline.""" + + +@dataclass +class SandboxTask1(SandboxTask): + """Task slot 1 in a pipeline.""" + + +@dataclass +class SandboxTask2(SandboxTask): + """Task slot 2 in a pipeline.""" + + +@dataclass +class SandboxTask3(SandboxTask): + """Task slot 3 in a pipeline.""" + + +@dataclass +class SandboxTask4(SandboxTask): + """Task slot 4 in a pipeline.""" + + +def create_task_from_yaml(yaml_file: str) -> SandboxTask: + """Create a SandboxTask from a YAML config file.""" + with open(yaml_file) as file: + config_from_yaml = yaml.safe_load(file) + + script = config_from_yaml["script"] + function_name = config_from_yaml["slurm_config"].pop("_factory_") + slurm_config = globals()[function_name](**config_from_yaml["slurm_config"]) + args = config_from_yaml.get("args", None) + environment = config_from_yaml.get("environment", None) + + return SandboxTask(script=script, slurm_config=slurm_config, args=args, environment=environment) + + +@dataclass +class GlobalVariables: + """Shared variables for <> interpolation in pipeline YAMLs.""" + + hf_model: str = None + hf_data: str = None + + +@dataclass +class SandboxPipeline: + """A multi-task pipeline with shared global variables and task dependencies.""" + + global_vars: GlobalVariables = None + + task_0: SandboxTask0 = None + task_1: SandboxTask1 = None + task_2: SandboxTask2 = None + task_3: SandboxTask3 = None + task_4: SandboxTask4 = None + tasks: 
list[SandboxTask] = None + + test_level: int = 0 + allow_to_fail: bool = False + skip: bool = False + note: str = "" + task_configs: list[str] = None + experiment = None + + def __post_init__(self): + if self.tasks is None: + self.tasks = [] + for i in range(5): + task = getattr(self, "task_{}".format(i), None) + if task is not None: + self.tasks += [task] + if self.task_configs is not None: + self.tasks += [ + create_task_from_yaml(yaml_file=yaml_file) for yaml_file in self.task_configs + ] + + if self.global_vars is not None: + global_vars_dict = { + k: v for k, v in dataclasses.asdict(self.global_vars).items() if v is not None + } + + def _resolve(s): + if not isinstance(s, str): + return s + return re.sub( + r"<>", + lambda m: global_vars_dict.get(m.group(1), m.group(0)), + s, + ) + + for task in self.tasks: + if task.environment: + if isinstance(task.environment, list): + task.environment = [ + {k: _resolve(v) for k, v in item.items()} for item in task.environment + ] + else: + task.environment = {k: _resolve(v) for k, v in task.environment.items()} + if task.args: + task.args = [_resolve(a) for a in task.args] + + +# --------------------------------------------------------------------------- +# Code packager — sync only the necessary source trees to the cluster +# --------------------------------------------------------------------------- + +# Resolve paths relative to Model-Optimizer root (parent of launcher/) +LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__)) +MODELOPT_ROOT = os.path.dirname(LAUNCHER_DIR) + +# All paths relative to LAUNCHER_DIR so code/ mirrors the launcher directory. +# This produces the same layout as nmm-sandbox's slurm.py: +# code/modules/Megatron-LM/megatron/... +# code/modules/Model-Optimizer/modelopt/... +# code/services/... 
+packager = run.PatternPackager( + include_pattern=[ + "modules/Megatron-LM/megatron/*", + "modules/Megatron-LM/examples/*", + "modules/Megatron-LM/*.py", + "modules/Model-Optimizer/modelopt/*", + "modules/Model-Optimizer/examples/*", + "services/*", + "tests/*", + ], + relative_path=[LAUNCHER_DIR] * 7, +) + + +# --------------------------------------------------------------------------- +# Executor builders +# --------------------------------------------------------------------------- + + +def get_slurm_executor(user, identity, slurm_config, experiment_id, job_dir, task_name): + """Build a SlurmExecutor for remote job submission.""" + container_mounts = slurm_config.container_mounts or [] + + scratch_dst = "/scratchspace" + scratch_src = job_dir + "/cicd/" + experiment_id + modelopt_dst = slurm_config.modelopt_install_path + modelopt_src = ( + job_dir + + "/cicd/" + + experiment_id + + "/{}/code/modules/Model-Optimizer/modelopt".format(task_name) + ) + container_mounts = [ + *container_mounts, + scratch_src + ":" + scratch_dst, + modelopt_src + ":" + modelopt_dst, + ] + + tunnel = run.SSHTunnel( + host=slurm_config.host, + user=getpass.getuser() if user is None else user, + port=slurm_config.port, + job_dir=job_dir, + identity=identity, + ) + + executor = run.SlurmExecutor( + account=slurm_config.account, + partition=slurm_config.partition, + ntasks_per_node=slurm_config.ntasks_per_node, + gpus_per_node=slurm_config.gpus_per_node, + nodes=slurm_config.nodes, + tunnel=tunnel, + container_image=slurm_config.container, + container_mounts=container_mounts, + array=slurm_config.array, + time="04:00:00", + mem="0", + retries=0, + packager=packager, + srun_args=slurm_config.srun_args, + ) + return executor + + +def get_docker_executor(hf_local, slurm_config, experiment_id, job_dir, task_name): + """Build a DockerExecutor for local GPU jobs.""" + if slurm_config.local: + container_mounts = list(slurm_config.container_mounts or []) + else: + container_mounts = [] + 
container_mounts += [hf_local + ":/hf-local", job_dir + "/cicd:/cicd"] + + scratch_dst = "/scratchspace" + scratch_src = job_dir + "/cicd/" + experiment_id + "/" + task_name + modelopt_dst = slurm_config.modelopt_install_path + modelopt_src = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") + container_mounts += [scratch_src + ":" + scratch_dst, modelopt_src + ":" + modelopt_dst] + + executor = run.DockerExecutor( + num_gpus=-1, + runtime="nvidia", + ipc_mode="host", + container_image=slurm_config.container, + volumes=container_mounts, + additional_kwargs={"user": "{}:{}".format(os.getuid(), os.getgid())}, + packager=packager, + ) + return executor + + +# --------------------------------------------------------------------------- +# Main entrypoint +# --------------------------------------------------------------------------- + + +@run.cli.entrypoint +def launch( + ctx: run.cli.RunContext, + job_name: str = "01_job", + job_dir: str = os.environ.get("SLURM_JOB_DIR", os.path.expanduser("~/experiments")), + task: SandboxTask | None = None, + pipeline: SandboxPipeline | None = None, + hf_local: str | None = None, + user: str = getpass.getuser(), + identity: str | None = None, +) -> None: + """Launch ModelOpt jobs on Slurm or locally with Docker. + + Args: + job_name: Name of the job. + job_dir: Remote directory for job artifacts. + task: Single task config (from YAML). + pipeline: Multi-task pipeline config (from YAML). + hf_local: Path to local HF cache (enables local Docker execution). + user: SSH user for Slurm tunnel. + identity: SSH identity file for Slurm tunnel. + """ + if "NEMORUN_HOME" not in os.environ: + warnings.warn("NEMORUN_HOME is not set. 
Defaulting to current working directory.") + run.config.set_nemorun_home(os.environ.get("NEMORUN_HOME", os.getcwd())) + + if hf_local is not None: + job_dir = os.getcwd() + "/experiments" + + job_table = {} + + if task is not None: + job_table[job_name] = SandboxPipeline(tasks=[task]) + elif pipeline is not None: + job_table[job_name] = pipeline + else: + print("No task or pipeline provided. Use task=@ or pipeline=@.") + return + + for job_name, job in job_table.items(): # noqa: PLR1704 + if job.skip: + continue + + dependency = None + exp = run.Experiment("modelopt", log_level="INFO") + job.experiment = exp + + with exp: + for task_id, task in enumerate(job.tasks): # noqa: PLR1704 + task_name = job_name + "_" + str(task_id) + task_args = [] if task.args is None else task.args + + task_env = {} + if task.environment is not None: + if isinstance(task.environment, list): + for item in task.environment: + task_env.update(item.items()) + else: + task_env = task.environment + for k, v in task_env.items(): + task_env[k] = "" if v is None else str(v) + if hf_local is not None: + executor = get_docker_executor( + hf_local, task.slurm_config, exp._id, job_dir, task_name + ) + task_env.update(DEFAULT_LOCAL_ENV) + else: + executor = get_slurm_executor( + user, identity, task.slurm_config, exp._id, job_dir, task_name + ) + task_env.update(DEFAULT_SLURM_ENV) + + task_instance = run.Script(task.script, args=task_args, env=task_env) + print( + "job {} task {} slurm_config: {}".format(job_name, task_id, task.slurm_config) + ) + + if dependency is None: + dependency = exp.add( + task_instance, tail_logs=True, name=task_name, executor=executor + ) + else: + dependency = exp.add( + task_instance, + tail_logs=True, + name=task_name, + executor=executor, + dependencies=[dependency], + ) + + exp.run(detach=ctx.detach) + + # Write metadata for downstream tools + metadata = { + "experiment_id": exp._id, + "job_name": job_name, + "allow_to_fail": job.allow_to_fail, + "note": job.note, + } 
+ metadata_path = os.path.join("experiments", "modelopt", exp._id, "metadata.json") + os.makedirs(os.path.dirname(metadata_path), exist_ok=True) + with open(metadata_path, "w") as f: + json.dump(metadata, f) + + +if __name__ == "__main__": + run.cli.main(launch) diff --git a/launcher/modules/Megatron-LM b/launcher/modules/Megatron-LM new file mode 160000 index 000000000..1e064f361 --- /dev/null +++ b/launcher/modules/Megatron-LM @@ -0,0 +1 @@ +Subproject commit 1e064f361256f34bf179c0cb808fd6287538f85a diff --git a/launcher/modules/Model-Optimizer b/launcher/modules/Model-Optimizer new file mode 160000 index 000000000..69c0d4794 --- /dev/null +++ b/launcher/modules/Model-Optimizer @@ -0,0 +1 @@ +Subproject commit 69c0d47946086d032e665ecf59a9ff28dc32f5b8 diff --git a/launcher/pyproject.toml b/launcher/pyproject.toml new file mode 100644 index 000000000..3a11c2a47 --- /dev/null +++ b/launcher/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "modelopt-launcher" +version = "0.1.0" +description = "ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters" +requires-python = ">=3.10" +dependencies = [ + "nemo_run", + "pyyaml", +] + +[tool.uv] +dev-dependencies = [] From f3d302008c453f0ddd74f0fe1638e46858b0950b Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Fri, 13 Mar 2026 10:23:07 -0700 Subject: [PATCH 02/12] add: shared core.py, slurm_config, services, and Qwen3-8B example Extract shared logic (dataclasses, executor builders, run loop, version reporting) into core.py. Both launch.py and nmm-sandbox's slurm.py import from core.py to avoid divergence. Add slurm_config.py with generic env-var-driven factory, service scripts, Qwen3-8B PTQ example, and README with usage, flags, and bug reporting instructions. Verified: same YAML produces identical MMLU 0.736 on OCI-HSG and 0.719 locally via both slurm.py and launch.py. 
Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml | 13 + launcher/README.md | 204 ++++++++ launcher/core.py | 477 ++++++++++++++++++ launcher/launch.py | 411 ++------------- launcher/pyproject.toml | 6 +- .../services/megatron-lm/quantize/quantize.sh | 47 ++ launcher/services/service_utils.sh | 62 +++ launcher/slurm_config.py | 77 +++ 8 files changed, 916 insertions(+), 381 deletions(-) create mode 100644 launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml create mode 100644 launcher/README.md create mode 100644 launcher/core.py create mode 100755 launcher/services/megatron-lm/quantize/quantize.sh create mode 100755 launcher/services/service_utils.sh create mode 100644 launcher/slurm_config.py diff --git a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml new file mode 100644 index 000000000..e2011c2ae --- /dev/null +++ b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -0,0 +1,13 @@ +script: services/megatron-lm/quantize/quantize.sh +args: + - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-size 32 +environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - QUANT_CFG: NVFP4_DEFAULT_CFG + - TP: 1 +slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 diff --git a/launcher/README.md b/launcher/README.md new file mode 100644 index 000000000..7f87782d8 --- /dev/null +++ b/launcher/README.md @@ -0,0 +1,204 @@ +# ModelOpt Launcher + +Submit ModelOpt quantization, training, and evaluation jobs to Slurm clusters or run them locally with Docker. 
+ +## Quick Start + +```bash +# Install dependencies +curl -LsSf https://astral.sh/uv/install.sh | sh +git submodule update --init --recursive + +# Run locally (requires local GPUs and Docker) +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + +# Run on a Slurm cluster +export SLURM_HOST=login-node.example.com +export SLURM_ACCOUNT=my_account +export SLURM_HF_LOCAL=/shared/hf-local +export SLURM_JOB_DIR=/shared/experiments +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` + +## Environment Variables + +| Variable | Description | Required | +|---|---|---| +| `SLURM_HOST` | Slurm login node hostname | Yes (remote jobs) | +| `SLURM_ACCOUNT` | Slurm account for billing | Yes (remote jobs) | +| `SLURM_JOB_DIR` | Remote directory for job artifacts | Yes (remote jobs) | +| `SLURM_HF_LOCAL` | Path to HuggingFace model cache on the cluster | Yes (remote jobs) | +| `HF_TOKEN` | HuggingFace API token | No | +| `NEMORUN_HOME` | NeMo Run home directory (default: cwd) | No | + +## Directory Structure + +```text +launcher/ +├── launch.py # Main entrypoint +├── slurm_config.py # SlurmConfig dataclass and factory +├── pyproject.toml # Dependencies (nemo-run, pyyaml) +├── services/ # Shell scripts executed on the cluster +│ ├── service_utils.sh # Error handling, MPI rank utilities +│ └── megatron-lm/quantize/ +│ └── quantize.sh # PTQ quantization + MMLU evaluation +├── Qwen/Qwen3-8B/ # Example: Qwen3-8B quantization config +│ └── megatron_lm_ptq.yaml +└── modules/ # Git submodules + ├── Megatron-LM/ # NVIDIA Megatron-LM training framework + └── Model-Optimizer/ # NVIDIA ModelOpt library +``` + +## Task YAML Format + +A task YAML defines the script to run, its arguments, environment variables, and Slurm configuration: + +```yaml +script: services/megatron-lm/quantize/quantize.sh +args: + - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-size 32 +environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - 
QUANT_CFG: NVFP4_DEFAULT_CFG + - TP: 1 +slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 +``` + +### Overriding Parameters + +Any parameter can be overridden from the command line: + +```bash +# Change the number of nodes +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + task.slurm_config.nodes=2 --yes + +# Change the container image +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + task.slurm_config.container=nvcr.io/nvidia/tensorrt-llm/release:1.3.0 --yes +``` + +### Useful Flags + +| Flag | Description | +|---|---| +| `--yes` / `-y` | Skip confirmation prompt | +| `-v` | Verbose output | +| `--dryrun` | Resolve and print the full config without running | +| `--to-yaml output.yaml` | Dump the resolved config to a YAML file without running | +| `detach=true` | Submit the job and return immediately (don't wait for completion) | + +```bash +# Preview the resolved config (all factory defaults expanded) +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v + +# Dump resolved config to file for inspection or reproducibility +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml + +# Reproduce from a dumped config (remove the first _partial_ line) +tail -n +2 resolved.yaml > clean.yaml +uv run launch.py --yaml clean.yaml --yes + +# Submit and detach +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml detach=true --yes +``` + +## Pipeline YAML Format + +A pipeline chains multiple tasks with shared variables and sequential dependencies: + +```yaml +global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + +task_0: + script: services/megatron-lm/quantize/quantize.sh + environment: + - HF_MODEL_CKPT: <> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + +task_1: + script: services/megatron-lm/export/export.sh + environment: + - HF_MODEL_CKPT: <> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 +``` + +Launch with: + +```bash +uv run 
launch.py pipeline=@my_pipeline.yaml --yes +``` + +Tasks run sequentially — `task_1` starts only after `task_0` completes. The `<>` syntax allows sharing values across tasks. + +## Adding a New Model + +1. Create a directory: `//` +2. Add a YAML config (e.g., `megatron_lm_ptq.yaml`) following the task format above +3. Set `MLM_MODEL_CFG` to the HuggingFace model ID +4. Choose `QUANT_CFG` (e.g., `NVFP4_DEFAULT_CFG`, `INT8_DEFAULT_CFG`) +5. Set `nodes`, `ntasks_per_node`, `gpus_per_node` based on model size + +## How It Works + +1. `launch.py` parses the YAML and creates a `SandboxTask` with a `SlurmConfig` +2. Code is packaged via `PatternPackager` — only `modules/Megatron-LM/`, `modules/Model-Optimizer/`, and `services/` are synced +3. For remote jobs: code is rsynced to the cluster, an sbatch script is generated and submitted via SSH +4. For local jobs: a Docker container is launched with the same container image and mounts +5. The `code/` directory on the cluster mirrors the launcher structure: + +```text +code/ +├── modules/ +│ ├── Megatron-LM/megatron/... +│ └── Model-Optimizer/modelopt/... +└── services/... +``` + +## Reporting Bugs + +When filing a bug report, please include: + +1. **Version summary** — printed at the start of every run: + + ```text + ============================================================ + Version Report + ============================================================ + Launcher d28acd33 (main) + Megatron-LM 1e064f361 (main) + Model-Optimizer 69c0d479 (main) + ============================================================ + ``` + +2. **Reproducible config** — dump with `--to-yaml`: + + ```bash + uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml bug_report.yaml + ``` + +3. **Error output** — the relevant error message or traceback from the job log. 
+ +File issues at: + +## Compatibility with nmm-sandbox + +This launcher produces the same `code/` layout as [nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)'s `slurm.py`. The same YAML configs work with both: + +```bash +# From nmm-sandbox (internal) +uv run slurm.py task=@modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# From Model-Optimizer/launcher (public) +uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` diff --git a/launcher/core.py b/launcher/core.py new file mode 100644 index 000000000..f75035f6e --- /dev/null +++ b/launcher/core.py @@ -0,0 +1,477 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared core logic for ModelOpt launcher and nmm-sandbox slurm.py. + +This module contains all dataclasses, executor builders, and the job run loop +shared between the public launcher (launch.py) and the internal CI orchestrator +(slurm.py). Each caller provides its own config (packager, defaults, experiment +title) and thin entrypoint. 
+""" + +import dataclasses +import getpass +import json +import os +import re +from dataclasses import dataclass + +import nemo_run as run +import yaml + +# --------------------------------------------------------------------------- +# Default environment variables injected into every job +# --------------------------------------------------------------------------- + +DEFAULT_EXPERIMENT_TITLE = "cicd" + + +def get_default_env(experiment_title=None): + """Return (slurm_env, local_env) dicts for the given experiment title.""" + title = experiment_title or DEFAULT_EXPERIMENT_TITLE + slurm_env = { + "TRITON_CACHE_DIR": f"/{title}/triton-cache", + "HF_HOME": f"/{title}/hf-cache", + "HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", + "LAUNCH_SCRIPT": "python", + } + local_env = { + "TRITON_CACHE_DIR": f"/{title}/triton-cache", + "HF_HOME": f"/{title}/hf-cache", + "HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", + } + return slurm_env, local_env + + +# SlurmConfig type — set by the caller via set_slurm_config_type() before use. +# This allows both slurm.py and launch.py to use their own SlurmConfig class. 
+_SLURM_CONFIG_TYPE = None + + +def set_slurm_config_type(cls): + """Register the SlurmConfig dataclass type used by SandboxTask.""" + global _SLURM_CONFIG_TYPE + _SLURM_CONFIG_TYPE = cls + # Patch SandboxTask's type annotation so nemo-run's CLI parser can resolve factories + SandboxTask.__dataclass_fields__["slurm_config"].type = cls + SandboxTask.__annotations__["slurm_config"] = cls + + +# --------------------------------------------------------------------------- +# Task and pipeline dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class SandboxTask: + """A single task with a script, slurm config, args, and environment.""" + + script: str = None + slurm_config: object = None # Patched at runtime by set_slurm_config_type() + args: list[str] = None + environment: list[dict[str, str]] = None + yaml_file: str = None + skip: bool = False + + +@dataclass +class SandboxTask0(SandboxTask): + """Task slot 0 in a pipeline.""" + + +@dataclass +class SandboxTask1(SandboxTask): + """Task slot 1 in a pipeline.""" + + +@dataclass +class SandboxTask2(SandboxTask): + """Task slot 2 in a pipeline.""" + + +@dataclass +class SandboxTask3(SandboxTask): + """Task slot 3 in a pipeline.""" + + +@dataclass +class SandboxTask4(SandboxTask): + """Task slot 4 in a pipeline.""" + + +def create_task_from_yaml(yaml_file, factory_lookup): + """Create a SandboxTask from a YAML config file. + + Args: + yaml_file: Path to the YAML config. + factory_lookup: Dict mapping factory names to callable factory functions. 
+ """ + with open(yaml_file) as file: + config_from_yaml = yaml.safe_load(file) + + script = config_from_yaml["script"] + function_name = config_from_yaml["slurm_config"].pop("_factory_") + slurm_config = factory_lookup[function_name](**config_from_yaml["slurm_config"]) + args = config_from_yaml.get("args", None) + environment = config_from_yaml.get("environment", None) + + return SandboxTask(script=script, slurm_config=slurm_config, args=args, environment=environment) + + +@dataclass +class GlobalVariables: + """Shared variables for <> interpolation in pipeline YAMLs.""" + + hf_model: str = None + hf_data: str = None + + +@dataclass +class SandboxPipeline: + """A multi-task pipeline with shared global variables and task dependencies.""" + + global_vars: GlobalVariables = None + + task_0: SandboxTask0 = None + task_1: SandboxTask1 = None + task_2: SandboxTask2 = None + task_3: SandboxTask3 = None + task_4: SandboxTask4 = None + tasks: list[SandboxTask] = None + + test_level: int = 0 + allow_to_fail: bool = False + skip: bool = False + note: str = "" + task_configs: list[str] = None + experiment = None + + # Set by caller — used by create_task_from_yaml + _factory_lookup: dict = None + + def __post_init__(self): + if self.tasks is None: + self.tasks = [] + for i in range(5): + task = getattr(self, f"task_{i}", None) + if task is not None: + self.tasks += [task] + if self.task_configs is not None and self._factory_lookup is not None: + self.tasks += [ + create_task_from_yaml(yaml_file=yf, factory_lookup=self._factory_lookup) + for yf in self.task_configs + ] + + if self.global_vars is not None: + global_vars_dict = { + k: v for k, v in dataclasses.asdict(self.global_vars).items() if v is not None + } + + def _resolve(s): + if not isinstance(s, str): + return s + return re.sub( + r"<>", + lambda m: global_vars_dict.get(m.group(1), m.group(0)), + s, + ) + + for task in self.tasks: + if task.environment: + if isinstance(task.environment, list): + task.environment = [ + 
{k: _resolve(v) for k, v in item.items()} for item in task.environment + ] + else: + task.environment = {k: _resolve(v) for k, v in task.environment.items()} + if task.args: + task.args = [_resolve(a) for a in task.args] + + +# --------------------------------------------------------------------------- +# Executor builders +# --------------------------------------------------------------------------- + + +def build_slurm_executor( + user, + identity, + slurm_config, + experiment_id, + job_dir, + task_name, + packager, + experiment_title="cicd", +): + """Build a SlurmExecutor for remote job submission.""" + container_mounts = list(slurm_config.container_mounts or []) + + scratch_dst = "/scratchspace" + scratch_src = f"{job_dir}/{experiment_title}/{experiment_id}" + modelopt_dst = slurm_config.modelopt_install_path + modelopt_src = ( + f"{job_dir}/{experiment_title}/{experiment_id}" + f"/{task_name}/code/modules/Model-Optimizer/modelopt" + ) + container_mounts += [ + f"{scratch_src}:{scratch_dst}", + f"{modelopt_src}:{modelopt_dst}", + f"{job_dir}/{experiment_title}:/{experiment_title}", + ] + + tunnel = run.SSHTunnel( + host=slurm_config.host, + user=getpass.getuser() if user is None else user, + port=slurm_config.port, + job_dir=job_dir, + identity=identity, + ) + + executor = run.SlurmExecutor( + account=slurm_config.account, + partition=slurm_config.partition, + ntasks_per_node=slurm_config.ntasks_per_node, + gpus_per_node=slurm_config.gpus_per_node, + nodes=slurm_config.nodes, + tunnel=tunnel, + container_image=slurm_config.container, + container_mounts=container_mounts, + array=slurm_config.array, + time="04:00:00", + mem="0", + retries=0, + packager=packager, + srun_args=slurm_config.srun_args, + ) + return executor + + +def build_docker_executor( + hf_local, + slurm_config, + experiment_id, + job_dir, + task_name, + packager, + modelopt_src_path=None, + experiment_title="cicd", +): + """Build a DockerExecutor for local GPU jobs.""" + if slurm_config.local: + 
container_mounts = list(slurm_config.container_mounts or []) + else: + container_mounts = [] + container_mounts += [f"{hf_local}:/hf-local"] + + scratch_dst = "/scratchspace" + scratch_src = os.path.join(job_dir, experiment_title, experiment_id, task_name) + os.makedirs(scratch_src, exist_ok=True) + modelopt_dst = slurm_config.modelopt_install_path + if modelopt_src_path is None: + modelopt_src_path = os.path.join(os.getcwd(), "modules/Model-Optimizer/modelopt") + exp_title_src = os.path.join(job_dir, experiment_title) + os.makedirs(exp_title_src, exist_ok=True) + container_mounts += [ + f"{scratch_src}:{scratch_dst}", + f"{modelopt_src_path}:{modelopt_dst}", + f"{exp_title_src}:/{experiment_title}", + ] + + executor = run.DockerExecutor( + num_gpus=-1, + runtime="nvidia", + ipc_mode="host", + container_image=slurm_config.container, + volumes=container_mounts, + additional_kwargs={"user": f"{os.getuid()}:{os.getgid()}"}, + packager=packager, + ) + return executor + + +# --------------------------------------------------------------------------- +# Version reporting +# --------------------------------------------------------------------------- + + +def _git_info(path): + """Get git commit hash and branch for a directory.""" + import subprocess # nosec B404 + + try: + commit = subprocess.run( # nosec B603 B607 + ["git", "rev-parse", "--short", "HEAD"], + cwd=path, + capture_output=True, + text=True, + timeout=5, + ).stdout.strip() + branch = subprocess.run( # nosec B603 B607 + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=path, + capture_output=True, + text=True, + timeout=5, + ).stdout.strip() + return commit, branch + except Exception: + return "unknown", "unknown" + + +def report_versions(base_dir): + """Print git commit and branch for the launcher and all submodules.""" + print("=" * 60) + print("Version Report") + print("=" * 60) + + # Launcher / repo root + commit, branch = _git_info(base_dir) + print(f" {'Launcher':<30} {commit:<12} ({branch})") + + # 
Submodules + modules_dir = os.path.join(base_dir, "modules") + if os.path.isdir(modules_dir): + for name in sorted(os.listdir(modules_dir)): + sub_path = os.path.join(modules_dir, name) + if os.path.exists(os.path.join(sub_path, ".git")): + commit, branch = _git_info(sub_path) + print(f" {name:<30} {commit:<12} ({branch})") + + print("=" * 60) + + +# --------------------------------------------------------------------------- +# Shared job run loop +# --------------------------------------------------------------------------- + + +def run_jobs( + job_table, + hf_local, + user, + identity, + job_dir, + packager, + default_slurm_env, + default_local_env, + experiment_title="cicd", + detach=False, + test_level=0, + modelopt_src_path=None, + base_dir=None, +): + """Run all jobs in job_table. + + Args: + job_table: Dict mapping job_name -> SandboxPipeline. + hf_local: Path to local HF cache (None for remote Slurm). + user: SSH user. + identity: SSH identity file. + job_dir: Base directory for job artifacts. + packager: PatternPackager instance. + default_slurm_env: Default env vars for Slurm jobs. + default_local_env: Default env vars for local Docker jobs. + experiment_title: Experiment title (e.g., "cicd" or "modelopt"). + detach: Whether to detach from the experiment. + test_level: Only run jobs with test_level <= this value. + modelopt_src_path: Path to modelopt source for Docker mounts. + base_dir: Base directory for version reporting (default: cwd). 
+ """ + report_versions(base_dir or os.getcwd()) + + for job_name, job in job_table.items(): + if job.test_level > test_level: + job.skip = True + if job.skip: + continue + + dependency = None + exp = run.Experiment(experiment_title, log_level="INFO") + job.experiment = exp + + with exp: + for task_id, task in enumerate(job.tasks): + if task.skip: + print(f"job {job_name} task {task_id}: skipped") + continue + task_name = f"{job_name}_{task_id}" + task_args = [] if task.args is None else task.args + + task_env = {} + if task.environment is not None: + if isinstance(task.environment, list): + for item in task.environment: + task_env.update(item.items()) + else: + task_env = task.environment + for k, v in task_env.items(): + task_env[k] = "" if v is None else str(v) + + if hf_local is not None: + executor = build_docker_executor( + hf_local, + task.slurm_config, + exp._id, + job_dir, + task_name, + packager, + modelopt_src_path, + experiment_title, + ) + task_env.update(default_local_env) + else: + executor = build_slurm_executor( + user, + identity, + task.slurm_config, + exp._id, + job_dir, + task_name, + packager, + experiment_title, + ) + task_env.update(default_slurm_env) + + task_instance = run.Script(task.script, args=task_args, env=task_env) + print(f"job {job_name} task {task_id} slurm_config: {task.slurm_config}") + + if dependency is None: + dependency = exp.add( + task_instance, tail_logs=True, name=task_name, executor=executor + ) + else: + dependency = exp.add( + task_instance, + tail_logs=True, + name=task_name, + executor=executor, + dependencies=[dependency], + ) + + exp.run(detach=detach) + + # Write metadata for downstream tools + metadata = { + "experiment_id": exp._id, + "job_name": job_name, + "allow_to_fail": job.allow_to_fail, + "note": job.note, + } + metadata_path = os.path.join("experiments", experiment_title, exp._id, "metadata.json") + os.makedirs(os.path.dirname(metadata_path), exist_ok=True) + with open(metadata_path, "w") as f: + 
json.dump(metadata, f) diff --git a/launcher/launch.py b/launcher/launch.py index 19b462f90..f6f1d928f 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -16,9 +16,8 @@ """ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters. Usage: - uv run launch.py task=@configs/quantize/Qwen3-8B.yaml --yes - uv run launch.py pipeline=@configs/pipeline/eagle3.yaml --yes - uv run launch.py task=@configs/quantize/Qwen3-8B.yaml hf_local=/mnt/hf-local --yes + uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes Environment variables: SLURM_HOST Slurm login node hostname (required for remote jobs) @@ -29,233 +28,26 @@ NEMORUN_HOME NeMo Run home directory (default: current working directory) """ -import dataclasses import getpass -import json import os -import re import warnings -from dataclasses import dataclass import nemo_run as run -import yaml - -# --------------------------------------------------------------------------- -# Slurm configuration -# --------------------------------------------------------------------------- - - -@dataclass -class SlurmConfig: - """Cluster-agnostic Slurm configuration. - - Users define cluster details in their YAML configs or override via CLI. - No internal cluster defaults are embedded here. 
- """ - - host: str | None = None - port: int = 22 - account: str | None = None - partition: str = "batch" - container: str | None = None - modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt" - container_mounts: list[str] | None = None - srun_args: list[str] | None = None - array: str | None = None - nodes: int = 1 - ntasks_per_node: int = 1 - gpus_per_node: int = 1 - local: bool = False - - -@run.cli.factory -@run.autoconvert -def slurm_factory( - host: str = os.environ.get("SLURM_HOST", ""), - account: str = os.environ.get("SLURM_ACCOUNT", ""), - partition: str = "batch", - nodes: int = 1, - ntasks_per_node: int = 1, - gpus_per_node: int = 1, - container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5", - modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt", - container_mounts: list[str] | None = None, - srun_args: list[str] | None = None, - array: str | None = None, -) -> SlurmConfig: - """Generic Slurm factory — configure via environment variables or CLI overrides.""" - if container_mounts is None: - hf_local = os.environ.get("SLURM_HF_LOCAL", "/hf-local") - container_mounts = ["{}:/hf-local".format(hf_local)] - if srun_args is None: - srun_args = ["--no-container-mount-home"] - return SlurmConfig( - host=host, - account=account, - partition=partition, - nodes=nodes, - ntasks_per_node=ntasks_per_node, - gpus_per_node=gpus_per_node, - container=container, - modelopt_install_path=modelopt_install_path, - container_mounts=container_mounts, - srun_args=srun_args, - array=array, - ) - - -# --------------------------------------------------------------------------- -# Default environment variables injected into every job -# --------------------------------------------------------------------------- - -DEFAULT_SLURM_ENV = { - "HF_HOME": "/hf-cache", - "HF_TOKEN": os.getenv("HF_TOKEN", ""), - "MLM_SKIP_INSTALL": "1", - "LAUNCH_SCRIPT": "python", -} - -DEFAULT_LOCAL_ENV = { - "HF_HOME": "/hf-cache", - 
"HF_TOKEN": os.getenv("HF_TOKEN", ""), - "MLM_SKIP_INSTALL": "1", -} - - -# --------------------------------------------------------------------------- -# Task and pipeline dataclasses -# --------------------------------------------------------------------------- - - -@dataclass -class SandboxTask: - """A single task with a script, slurm config, args, and environment.""" - - script: str = None - slurm_config: SlurmConfig = None - args: list[str] = None - environment: list[dict] = None - yaml_file: str = None - - -@dataclass -class SandboxTask0(SandboxTask): - """Task slot 0 in a pipeline.""" - - -@dataclass -class SandboxTask1(SandboxTask): - """Task slot 1 in a pipeline.""" - - -@dataclass -class SandboxTask2(SandboxTask): - """Task slot 2 in a pipeline.""" - - -@dataclass -class SandboxTask3(SandboxTask): - """Task slot 3 in a pipeline.""" - - -@dataclass -class SandboxTask4(SandboxTask): - """Task slot 4 in a pipeline.""" - - -def create_task_from_yaml(yaml_file: str) -> SandboxTask: - """Create a SandboxTask from a YAML config file.""" - with open(yaml_file) as file: - config_from_yaml = yaml.safe_load(file) - - script = config_from_yaml["script"] - function_name = config_from_yaml["slurm_config"].pop("_factory_") - slurm_config = globals()[function_name](**config_from_yaml["slurm_config"]) - args = config_from_yaml.get("args", None) - environment = config_from_yaml.get("environment", None) - - return SandboxTask(script=script, slurm_config=slurm_config, args=args, environment=environment) - - -@dataclass -class GlobalVariables: - """Shared variables for <> interpolation in pipeline YAMLs.""" - - hf_model: str = None - hf_data: str = None - - -@dataclass -class SandboxPipeline: - """A multi-task pipeline with shared global variables and task dependencies.""" - - global_vars: GlobalVariables = None - - task_0: SandboxTask0 = None - task_1: SandboxTask1 = None - task_2: SandboxTask2 = None - task_3: SandboxTask3 = None - task_4: SandboxTask4 = None - tasks: 
list[SandboxTask] = None - - test_level: int = 0 - allow_to_fail: bool = False - skip: bool = False - note: str = "" - task_configs: list[str] = None - experiment = None - - def __post_init__(self): - if self.tasks is None: - self.tasks = [] - for i in range(5): - task = getattr(self, "task_{}".format(i), None) - if task is not None: - self.tasks += [task] - if self.task_configs is not None: - self.tasks += [ - create_task_from_yaml(yaml_file=yaml_file) for yaml_file in self.task_configs - ] - - if self.global_vars is not None: - global_vars_dict = { - k: v for k, v in dataclasses.asdict(self.global_vars).items() if v is not None - } - - def _resolve(s): - if not isinstance(s, str): - return s - return re.sub( - r"<>", - lambda m: global_vars_dict.get(m.group(1), m.group(0)), - s, - ) - - for task in self.tasks: - if task.environment: - if isinstance(task.environment, list): - task.environment = [ - {k: _resolve(v) for k, v in item.items()} for item in task.environment - ] - else: - task.environment = {k: _resolve(v) for k, v in task.environment.items()} - if task.args: - task.args = [_resolve(a) for a in task.args] +from core import SandboxPipeline, SandboxTask, get_default_env, run_jobs, set_slurm_config_type +from slurm_config import SlurmConfig, slurm_factory # noqa: F401 +set_slurm_config_type(SlurmConfig) # --------------------------------------------------------------------------- -# Code packager — sync only the necessary source trees to the cluster +# Launcher-specific configuration # --------------------------------------------------------------------------- -# Resolve paths relative to Model-Optimizer root (parent of launcher/) LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__)) MODELOPT_ROOT = os.path.dirname(LAUNCHER_DIR) -# All paths relative to LAUNCHER_DIR so code/ mirrors the launcher directory. -# This produces the same layout as nmm-sandbox's slurm.py: -# code/modules/Megatron-LM/megatron/... -# code/modules/Model-Optimizer/modelopt/... 
-# code/services/... +EXPERIMENT_TITLE = "cicd" +DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE) + packager = run.PatternPackager( include_pattern=[ "modules/Megatron-LM/megatron/*", @@ -264,125 +56,38 @@ def _resolve(s): "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/examples/*", "services/*", - "tests/*", ], - relative_path=[LAUNCHER_DIR] * 7, + relative_path=[LAUNCHER_DIR] * 6, ) - -# --------------------------------------------------------------------------- -# Executor builders -# --------------------------------------------------------------------------- - - -def get_slurm_executor(user, identity, slurm_config, experiment_id, job_dir, task_name): - """Build a SlurmExecutor for remote job submission.""" - container_mounts = slurm_config.container_mounts or [] - - scratch_dst = "/scratchspace" - scratch_src = job_dir + "/cicd/" + experiment_id - modelopt_dst = slurm_config.modelopt_install_path - modelopt_src = ( - job_dir - + "/cicd/" - + experiment_id - + "/{}/code/modules/Model-Optimizer/modelopt".format(task_name) - ) - container_mounts = [ - *container_mounts, - scratch_src + ":" + scratch_dst, - modelopt_src + ":" + modelopt_dst, - ] - - tunnel = run.SSHTunnel( - host=slurm_config.host, - user=getpass.getuser() if user is None else user, - port=slurm_config.port, - job_dir=job_dir, - identity=identity, - ) - - executor = run.SlurmExecutor( - account=slurm_config.account, - partition=slurm_config.partition, - ntasks_per_node=slurm_config.ntasks_per_node, - gpus_per_node=slurm_config.gpus_per_node, - nodes=slurm_config.nodes, - tunnel=tunnel, - container_image=slurm_config.container, - container_mounts=container_mounts, - array=slurm_config.array, - time="04:00:00", - mem="0", - retries=0, - packager=packager, - srun_args=slurm_config.srun_args, - ) - return executor - - -def get_docker_executor(hf_local, slurm_config, experiment_id, job_dir, task_name): - """Build a DockerExecutor for local GPU jobs.""" - if 
slurm_config.local: - container_mounts = list(slurm_config.container_mounts or []) - else: - container_mounts = [] - container_mounts += [hf_local + ":/hf-local", job_dir + "/cicd:/cicd"] - - scratch_dst = "/scratchspace" - scratch_src = job_dir + "/cicd/" + experiment_id + "/" + task_name - modelopt_dst = slurm_config.modelopt_install_path - modelopt_src = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") - container_mounts += [scratch_src + ":" + scratch_dst, modelopt_src + ":" + modelopt_dst] - - executor = run.DockerExecutor( - num_gpus=-1, - runtime="nvidia", - ipc_mode="host", - container_image=slurm_config.container, - volumes=container_mounts, - additional_kwargs={"user": "{}:{}".format(os.getuid(), os.getgid())}, - packager=packager, - ) - return executor +MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") # --------------------------------------------------------------------------- -# Main entrypoint +# Entrypoint # --------------------------------------------------------------------------- @run.cli.entrypoint def launch( - ctx: run.cli.RunContext, job_name: str = "01_job", job_dir: str = os.environ.get("SLURM_JOB_DIR", os.path.expanduser("~/experiments")), - task: SandboxTask | None = None, - pipeline: SandboxPipeline | None = None, - hf_local: str | None = None, + task: SandboxTask = None, + pipeline: SandboxPipeline = None, + hf_local: str = None, # noqa: RUF013 user: str = getpass.getuser(), - identity: str | None = None, + identity: str = None, # noqa: RUF013 + detach: bool = False, ) -> None: - """Launch ModelOpt jobs on Slurm or locally with Docker. - - Args: - job_name: Name of the job. - job_dir: Remote directory for job artifacts. - task: Single task config (from YAML). - pipeline: Multi-task pipeline config (from YAML). - hf_local: Path to local HF cache (enables local Docker execution). - user: SSH user for Slurm tunnel. - identity: SSH identity file for Slurm tunnel. 
- """ + """Launch ModelOpt jobs on Slurm or locally with Docker.""" if "NEMORUN_HOME" not in os.environ: warnings.warn("NEMORUN_HOME is not set. Defaulting to current working directory.") run.config.set_nemorun_home(os.environ.get("NEMORUN_HOME", os.getcwd())) if hf_local is not None: - job_dir = os.getcwd() + "/experiments" + job_dir = os.path.join(os.getcwd(), "local_experiments") job_table = {} - if task is not None: job_table[job_name] = SandboxPipeline(tasks=[task]) elif pipeline is not None: @@ -391,70 +96,20 @@ def launch( print("No task or pipeline provided. Use task=@ or pipeline=@.") return - for job_name, job in job_table.items(): # noqa: PLR1704 - if job.skip: - continue - - dependency = None - exp = run.Experiment("modelopt", log_level="INFO") - job.experiment = exp - - with exp: - for task_id, task in enumerate(job.tasks): # noqa: PLR1704 - task_name = job_name + "_" + str(task_id) - task_args = [] if task.args is None else task.args - - task_env = {} - if task.environment is not None: - if isinstance(task.environment, list): - for item in task.environment: - task_env.update(item.items()) - else: - task_env = task.environment - for k, v in task_env.items(): - task_env[k] = "" if v is None else str(v) - if hf_local is not None: - executor = get_docker_executor( - hf_local, task.slurm_config, exp._id, job_dir, task_name - ) - task_env.update(DEFAULT_LOCAL_ENV) - else: - executor = get_slurm_executor( - user, identity, task.slurm_config, exp._id, job_dir, task_name - ) - task_env.update(DEFAULT_SLURM_ENV) - - task_instance = run.Script(task.script, args=task_args, env=task_env) - print( - "job {} task {} slurm_config: {}".format(job_name, task_id, task.slurm_config) - ) - - if dependency is None: - dependency = exp.add( - task_instance, tail_logs=True, name=task_name, executor=executor - ) - else: - dependency = exp.add( - task_instance, - tail_logs=True, - name=task_name, - executor=executor, - dependencies=[dependency], - ) - - 
exp.run(detach=ctx.detach) - - # Write metadata for downstream tools - metadata = { - "experiment_id": exp._id, - "job_name": job_name, - "allow_to_fail": job.allow_to_fail, - "note": job.note, - } - metadata_path = os.path.join("experiments", "modelopt", exp._id, "metadata.json") - os.makedirs(os.path.dirname(metadata_path), exist_ok=True) - with open(metadata_path, "w") as f: - json.dump(metadata, f) + run_jobs( + job_table=job_table, + hf_local=hf_local, + user=user, + identity=identity, + job_dir=job_dir, + packager=packager, + default_slurm_env=DEFAULT_SLURM_ENV, + default_local_env=DEFAULT_LOCAL_ENV, + experiment_title=EXPERIMENT_TITLE, + detach=detach, + modelopt_src_path=MODELOPT_SRC_PATH, + base_dir=LAUNCHER_DIR, + ) if __name__ == "__main__": diff --git a/launcher/pyproject.toml b/launcher/pyproject.toml index 3a11c2a47..0e576e5af 100644 --- a/launcher/pyproject.toml +++ b/launcher/pyproject.toml @@ -4,9 +4,9 @@ version = "0.1.0" description = "ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters" requires-python = ">=3.10" dependencies = [ - "nemo_run", + "nemo-run@git+https://github.com/NVIDIA-NeMo/Run@2ccf1c9e68acd157da451721b24635bcc83be87e", "pyyaml", ] -[tool.uv] -dev-dependencies = [] +[dependency-groups] +dev = [] diff --git a/launcher/services/megatron-lm/quantize/quantize.sh b/launcher/services/megatron-lm/quantize/quantize.sh new file mode 100755 index 000000000..d4b3d5248 --- /dev/null +++ b/launcher/services/megatron-lm/quantize/quantize.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../../service_utils.sh + +util_install_extra_dep + +trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER +################################################################################################### + +if [[ -z ${HF_MODEL_CKPT} ]]; then + export HF_MODEL_CKPT="/hf-local/${MLM_MODEL_CFG}" +fi +export MLM_MODEL_SAVE="/scratchspace/megatron-lm/${MLM_MODEL_CFG}" +export EXPORT_DIR="/scratchspace/export/${MLM_MODEL_CFG}_${QUANT_CFG}" +export MLM_SKIP_INSTALL=1 + +QUANTIZE_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/quantize.sh" +MMLU_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/mmlu.sh" +CONVERT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/convert.sh" +EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh" + +export MLM_EXTRA_ARGS=${@} +${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG} + +export MLM_EXTRA_ARGS="--mmlu-dataset /hf-local/cais/mmlu --fraction 0.01 --lower-bound 0.38 --disable-tqdm" +MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG} + +################################################################################################### + +# This function handles the exit status (fails the CI). 
+exit_handler $0 diff --git a/launcher/services/service_utils.sh b/launcher/services/service_utils.sh new file mode 100755 index 000000000..f9d15b279 --- /dev/null +++ b/launcher/services/service_utils.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +native_mpi_rank=$OMPI_COMM_WORLD_RANK +native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK +# Works with Slurm launching with `--mpi=pmix` +mpi_rank=${PMIX_RANK:-$native_mpi_rank} +mpi_local_rank=${PMIX_LOCAL_RANK:-$native_mpi_local_rank} + +FAIL=0 +FAIL_EXIT=0 + +function error_handler { + local last_status_code=$? + echo "[ERROR] $1:$2 failed with status $last_status_code." 
>&2 + + if [[ "$mpi_rank" -eq 0 ]]; then + echo "$1:$2" >&2 + fi + FAIL=1 + FAIL_EXIT=1 +} + +function exit_handler { + if [[ $FAIL_EXIT == 1 ]]; then + exit 1 + fi +} + +function report_result { + if [[ "$mpi_rank" -eq 0 ]]; then + echo "$1" + fi +} + +function util_install_extra_dep { + if [[ "$mpi_local_rank" -eq 0 ]]; then + pip install diskcache + fi +} + +LOCAL_NUM_GPUS=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1) +printf "RANK ${mpi_rank} GPU count: ${LOCAL_NUM_GPUS}\n" + +# Increase the modelopt version number manually +if [[ "$mpi_local_rank" -eq 0 ]]; then + echo "__version__ = '1.0.0'" >> ./modules/Model-Optimizer/modelopt/__init__.py +fi diff --git a/launcher/slurm_config.py b/launcher/slurm_config.py new file mode 100644 index 000000000..53e39aa42 --- /dev/null +++ b/launcher/slurm_config.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Slurm configuration and factory for the ModelOpt Launcher.""" + +import os +from dataclasses import dataclass + +import nemo_run as run + + +@dataclass +class SlurmConfig: + """Cluster-agnostic Slurm configuration. + + Users define cluster details in their YAML configs or override via CLI. + No internal cluster defaults are embedded here. 
+ """ + + host: str = None + port: int = 22 + account: str = None + partition: str = "batch" + container: str = None + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt" + container_mounts: list[str] = None + srun_args: list[str] = None + array: str = None + nodes: int = 1 + ntasks_per_node: int = 1 + gpus_per_node: int = 1 + local: bool = False + + +@run.cli.factory +@run.autoconvert +def slurm_factory( + host: str = os.environ.get("SLURM_HOST", ""), + account: str = os.environ.get("SLURM_ACCOUNT", ""), + partition: str = "batch", + nodes: int = 1, + ntasks_per_node: int = 1, + gpus_per_node: int = 1, + container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5", + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt", + container_mounts: list[str] = [ + "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")), + ], + srun_args: list[str] = ["--no-container-mount-home"], + array: str = None, # noqa: RUF013 +) -> SlurmConfig: + """Generic Slurm factory — configure via environment variables or CLI overrides.""" + return SlurmConfig( + host=host, + account=account, + partition=partition, + nodes=nodes, + ntasks_per_node=ntasks_per_node, + gpus_per_node=gpus_per_node, + container=container, + modelopt_install_path=modelopt_install_path, + container_mounts=container_mounts, + srun_args=srun_args, + array=array, + ) From f7f9878eb4aa1da80e0db12247975bc498e8f862 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Fri, 13 Mar 2026 17:25:26 -0700 Subject: [PATCH 03/12] fix: add factory registry for task_configs YAML resolution Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- launcher/core.py | 18 +++++++++++++----- launcher/launch.py | 12 ++++++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/launcher/core.py b/launcher/core.py index f75035f6e..18e22dfe8 100644 --- a/launcher/core.py +++ b/launcher/core.py @@ -60,6 +60,7 @@ def 
get_default_env(experiment_title=None): # SlurmConfig type — set by the caller via set_slurm_config_type() before use. # This allows both slurm.py and launch.py to use their own SlurmConfig class. _SLURM_CONFIG_TYPE = None +_FACTORY_REGISTRY = {} def set_slurm_config_type(cls): @@ -71,6 +72,11 @@ def set_slurm_config_type(cls): SandboxTask.__annotations__["slurm_config"] = cls +def register_factory(name, fn): + """Register a factory function by name for task_configs YAML resolution.""" + _FACTORY_REGISTRY[name] = fn + + # --------------------------------------------------------------------------- # Task and pipeline dataclasses # --------------------------------------------------------------------------- @@ -170,11 +176,13 @@ def __post_init__(self): task = getattr(self, f"task_{i}", None) if task is not None: self.tasks += [task] - if self.task_configs is not None and self._factory_lookup is not None: - self.tasks += [ - create_task_from_yaml(yaml_file=yf, factory_lookup=self._factory_lookup) - for yf in self.task_configs - ] + if self.task_configs is not None: + lookup = self._factory_lookup or _FACTORY_REGISTRY + if lookup: + self.tasks += [ + create_task_from_yaml(yaml_file=yf, factory_lookup=lookup) + for yf in self.task_configs + ] if self.global_vars is not None: global_vars_dict = { diff --git a/launcher/launch.py b/launcher/launch.py index f6f1d928f..9d9c9c993 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -33,10 +33,18 @@ import warnings import nemo_run as run -from core import SandboxPipeline, SandboxTask, get_default_env, run_jobs, set_slurm_config_type -from slurm_config import SlurmConfig, slurm_factory # noqa: F401 +from core import ( + SandboxPipeline, + SandboxTask, + get_default_env, + register_factory, + run_jobs, + set_slurm_config_type, +) +from slurm_config import SlurmConfig, slurm_factory set_slurm_config_type(SlurmConfig) +register_factory("slurm_factory", slurm_factory) # 
--------------------------------------------------------------------------- # Launcher-specific configuration From ad1f0d8f98a879cc7c9bd64925a5590b073d2339 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sat, 14 Mar 2026 17:55:15 -0700 Subject: [PATCH 04/12] chg: remove task param from launch.py, update YAML format and README launch.py now only accepts pipeline=@ or --yaml. Update README with --yaml vs pipeline=@ docs, useful flags, and bug reporting. Update Qwen3-8B config to new --yaml format with job_name + pipeline. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml | 34 ++-- launcher/README.md | 170 ++++++++++++-------- launcher/launch.py | 20 +-- 3 files changed, 128 insertions(+), 96 deletions(-) diff --git a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml index e2011c2ae..83ed7f4f0 100644 --- a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml +++ b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -1,13 +1,21 @@ -script: services/megatron-lm/quantize/quantize.sh -args: - - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail - - --calib-size 32 -environment: - - MLM_MODEL_CFG: Qwen/Qwen3-8B - - QUANT_CFG: NVFP4_DEFAULT_CFG - - TP: 1 -slurm_config: - _factory_: "slurm_factory" - nodes: 1 - ntasks_per_node: 4 - gpus_per_node: 4 +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + skip: false + allow_to_fail: false + note: + + task_0: + script: services/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-size 32 + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - QUANT_CFG: NVFP4_DEFAULT_CFG + - TP: 4 + slurm_config: + _factory_: "slurm_factory" # oci_hsg_slurm_factory + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + diff --git a/launcher/README.md b/launcher/README.md index 7f87782d8..725363341 100644 --- a/launcher/README.md +++ b/launcher/README.md @@ -10,14 +10,14 @@ curl -LsSf 
https://astral.sh/uv/install.sh | sh git submodule update --init --recursive # Run locally (requires local GPUs and Docker) -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes # Run on a Slurm cluster export SLURM_HOST=login-node.example.com export SLURM_ACCOUNT=my_account export SLURM_HF_LOCAL=/shared/hf-local export SLURM_JOB_DIR=/shared/experiments -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes ``` ## Environment Variables @@ -36,37 +36,108 @@ uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes ```text launcher/ ├── launch.py # Main entrypoint +├── core.py # Shared logic (also used by nmm-sandbox's slurm.py) ├── slurm_config.py # SlurmConfig dataclass and factory ├── pyproject.toml # Dependencies (nemo-run, pyyaml) ├── services/ # Shell scripts executed on the cluster │ ├── service_utils.sh # Error handling, MPI rank utilities │ └── megatron-lm/quantize/ -│ └── quantize.sh # PTQ quantization + MMLU evaluation -├── Qwen/Qwen3-8B/ # Example: Qwen3-8B quantization config +│ ├── quantize.sh # PTQ quantization + MMLU evaluation +│ └── Qwen3-8B.yaml # Task config for Qwen3-8B +├── Qwen/Qwen3-8B/ # Example pipeline config │ └── megatron_lm_ptq.yaml └── modules/ # Git submodules ├── Megatron-LM/ # NVIDIA Megatron-LM training framework └── Model-Optimizer/ # NVIDIA ModelOpt library ``` -## Task YAML Format +## YAML Config Format -A task YAML defines the script to run, its arguments, environment variables, and Slurm configuration: +A config YAML defines the job name, pipeline metadata, and one or more tasks: ```yaml -script: services/megatron-lm/quantize/quantize.sh -args: - - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail - - --calib-size 32 -environment: - - MLM_MODEL_CFG: Qwen/Qwen3-8B - - QUANT_CFG: NVFP4_DEFAULT_CFG - - TP: 1 
-slurm_config: - _factory_: "slurm_factory" - nodes: 1 - ntasks_per_node: 4 - gpus_per_node: 4 +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + skip: false + allow_to_fail: false + note: + + task_0: + script: services/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-size 32 + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - QUANT_CFG: NVFP4_DEFAULT_CFG + - TP: 1 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 +``` + +### Multi-task Pipeline + +Tasks run sequentially — `task_1` starts only after `task_0` completes: + +```yaml +job_name: Qwen3-8B_quantize_export +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + task_0: + script: services/megatron-lm/quantize/quantize.sh + environment: + - HF_MODEL_CKPT: <> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + + task_1: + script: services/megatron-lm/export/export.sh + environment: + - HF_MODEL_CKPT: <> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 +``` + +The `<>` syntax shares values across tasks. + +### `--yaml` vs `pipeline=@` + +There are two ways to load a config: + +**`--yaml config.yaml`** (recommended) — the YAML maps top-level keys to function arguments. +The file contains both `job_name` and `pipeline`: + +```yaml +# config.yaml — used with: uv run launch.py --yaml config.yaml --yes +job_name: Qwen3-8B_NVFP4 +pipeline: + task_0: + script: services/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**`pipeline=@config.yaml`** — the YAML is a bare `SandboxPipeline` (no `job_name` or `pipeline` wrapper). 
+This is useful for reusing pipeline configs across different job names: + +```yaml +# bare_pipeline.yaml — used with: uv run launch.py pipeline=@bare_pipeline.yaml --yes +task_0: + script: services/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +```bash +# With pipeline=@, set job_name separately +uv run launch.py pipeline=@bare_pipeline.yaml job_name=my_job --yes ``` ### Overriding Parameters @@ -75,12 +146,12 @@ Any parameter can be overridden from the command line: ```bash # Change the number of nodes -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ - task.slurm_config.nodes=2 --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.task_0.slurm_config.nodes=2 --yes # Change the container image -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ - task.slurm_config.container=nvcr.io/nvidia/tensorrt-llm/release:1.3.0 --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.task_0.slurm_config.container=nvcr.io/nvidia/tensorrt-llm/release:1.3.0 --yes ``` ### Useful Flags @@ -95,63 +166,26 @@ uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ ```bash # Preview the resolved config (all factory defaults expanded) -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v # Dump resolved config to file for inspection or reproducibility -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml - -# Reproduce from a dumped config (remove the first _partial_ line) -tail -n +2 resolved.yaml > clean.yaml -uv run launch.py --yaml clean.yaml --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml # Submit and detach -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml detach=true --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml detach=true --yes ``` -## Pipeline YAML Format - -A pipeline 
chains multiple tasks with shared variables and sequential dependencies: - -```yaml -global_vars: - hf_model: /hf-local/Qwen/Qwen3-8B - -task_0: - script: services/megatron-lm/quantize/quantize.sh - environment: - - HF_MODEL_CKPT: <> - slurm_config: - _factory_: "slurm_factory" - nodes: 1 - -task_1: - script: services/megatron-lm/export/export.sh - environment: - - HF_MODEL_CKPT: <> - slurm_config: - _factory_: "slurm_factory" - nodes: 1 -``` - -Launch with: - -```bash -uv run launch.py pipeline=@my_pipeline.yaml --yes -``` - -Tasks run sequentially — `task_1` starts only after `task_0` completes. The `<>` syntax allows sharing values across tasks. - ## Adding a New Model 1. Create a directory: `//` -2. Add a YAML config (e.g., `megatron_lm_ptq.yaml`) following the task format above +2. Add a YAML config (e.g., `megatron_lm_ptq.yaml`) following the format above 3. Set `MLM_MODEL_CFG` to the HuggingFace model ID 4. Choose `QUANT_CFG` (e.g., `NVFP4_DEFAULT_CFG`, `INT8_DEFAULT_CFG`) 5. Set `nodes`, `ntasks_per_node`, `gpus_per_node` based on model size ## How It Works -1. `launch.py` parses the YAML and creates a `SandboxTask` with a `SlurmConfig` +1. `launch.py` parses the YAML and creates a `SandboxPipeline` with tasks and `SlurmConfig` 2. Code is packaged via `PatternPackager` — only `modules/Megatron-LM/`, `modules/Model-Optimizer/`, and `services/` are synced 3. For remote jobs: code is rsynced to the cluster, an sbatch script is generated and submitted via SSH 4. For local jobs: a Docker container is launched with the same container image and mounts @@ -184,7 +218,7 @@ When filing a bug report, please include: 2. **Reproducible config** — dump with `--to-yaml`: ```bash - uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml bug_report.yaml + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml bug_report.yaml ``` 3. **Error output** — the relevant error message or traceback from the job log. 
@@ -197,8 +231,8 @@ This launcher produces the same `code/` layout as [nmm-sandbox](https://gitlab-m ```bash # From nmm-sandbox (internal) -uv run slurm.py task=@modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes # From Model-Optimizer/launcher (public) -uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes ``` diff --git a/launcher/launch.py b/launcher/launch.py index 9d9c9c993..7251effd1 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -16,8 +16,8 @@ """ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters. Usage: - uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes - uv run launch.py task=@Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes Environment variables: SLURM_HOST Slurm login node hostname (required for remote jobs) @@ -33,14 +33,7 @@ import warnings import nemo_run as run -from core import ( - SandboxPipeline, - SandboxTask, - get_default_env, - register_factory, - run_jobs, - set_slurm_config_type, -) +from core import SandboxPipeline, get_default_env, register_factory, run_jobs, set_slurm_config_type from slurm_config import SlurmConfig, slurm_factory set_slurm_config_type(SlurmConfig) @@ -80,7 +73,6 @@ def launch( job_name: str = "01_job", job_dir: str = os.environ.get("SLURM_JOB_DIR", os.path.expanduser("~/experiments")), - task: SandboxTask = None, pipeline: SandboxPipeline = None, hf_local: str = None, # noqa: RUF013 user: str = getpass.getuser(), @@ -96,12 +88,10 @@ def launch( job_dir = os.path.join(os.getcwd(), "local_experiments") job_table = {} - if task is not None: - job_table[job_name] = SandboxPipeline(tasks=[task]) - 
elif pipeline is not None: + if pipeline is not None: job_table[job_name] = pipeline else: - print("No task or pipeline provided. Use task=@ or pipeline=@.") + print("No pipeline provided. Use pipeline=@ or --yaml .") return run_jobs( From 8e083658be786da5181fa7d0c4968f14bbddde37 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Sat, 14 Mar 2026 18:59:38 -0700 Subject: [PATCH 05/12] add: common/ scripts, EAGLE3 pipeline, ADVANCED.md Move service scripts to common/ (query.py, query.sh, eagle3, specdec_bench, megatron-lm quantize). Add Qwen3-8B EAGLE3 offline pipeline YAML. Add ADVANCED.md with architecture docs and Claude Code workflows. Update packager to include common/. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Chenhan Yu --- launcher/ADVANCED.md | 242 ++++++++++++++++++ launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml | 111 ++++++++ launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml | 2 +- launcher/common/eagle3/dump_offline_data.sh | 42 +++ launcher/common/eagle3/offline_training.sh | 40 +++ .../megatron-lm/quantize/quantize.sh | 0 launcher/common/query.py | 147 +++++++++++ .../{services => common}/service_utils.sh | 0 launcher/common/specdec_bench/quick_check.sh | 27 ++ launcher/common/tensorrt-llm/query.sh | 130 ++++++++++ launcher/common/vllm/query.sh | 129 ++++++++++ launcher/launch.py | 3 +- 12 files changed, 871 insertions(+), 2 deletions(-) create mode 100644 launcher/ADVANCED.md create mode 100644 launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml create mode 100644 launcher/common/eagle3/dump_offline_data.sh create mode 100644 launcher/common/eagle3/offline_training.sh rename launcher/{services => common}/megatron-lm/quantize/quantize.sh (100%) create mode 100644 launcher/common/query.py rename launcher/{services => common}/service_utils.sh (100%) create mode 100644 launcher/common/specdec_bench/quick_check.sh create mode 100644 launcher/common/tensorrt-llm/query.sh create mode 100755 launcher/common/vllm/query.sh diff --git a/launcher/ADVANCED.md 
b/launcher/ADVANCED.md new file mode 100644 index 000000000..fb4bc0256 --- /dev/null +++ b/launcher/ADVANCED.md @@ -0,0 +1,242 @@ +# Advanced Guide + +## Architecture + +### Shared Core + +The launcher is built on a shared `core.py` module used by both: + +- **`launch.py`** — public-facing launcher (this repo) +- **`slurm.py`** — internal CI orchestrator ([nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)) + +```text +core.py (shared) +├── Dataclasses: SandboxTask, SandboxPipeline, GlobalVariables +├── Executor builders: build_slurm_executor(), build_docker_executor() +├── Job runner: run_jobs() +├── Version reporter: report_versions() +├── Factory registry: register_factory(), set_slurm_config_type() +└── Default env: get_default_env() + +launch.py slurm.py (nmm-sandbox) +├── imports core.py ├── imports core.py (via sys.path) +├── slurm_config.py (env-var driven) ├── tools/slurm_config.py (cluster-specific) +├── registers: slurm_factory ├── registers: oci_hsg, cw_dfw, computelab, ... +├── packager (LAUNCHER_DIR relative) ├── packager (repo root relative) +└── launch() entrypoint └── cicd() entrypoint +``` + +### Code Packaging + +When a job is submitted, `PatternPackager` creates a tar.gz of the source code and rsyncs it to the cluster. The `code/` directory on the remote mirrors the launcher structure: + +```text +code/ +├── modules/ +│ ├── Megatron-LM/megatron/... # Training framework +│ └── Model-Optimizer/modelopt/... # ModelOpt library (mounted over container install) +└── services/ + └── megatron-lm/quantize/ + └── quantize.sh # Job script +``` + +The `modelopt/` directory is bind-mounted over the container's installed ModelOpt, so your local changes take effect without rebuilding the container. + +### Factory System + +Slurm cluster configs use a factory pattern. 
YAMLs reference a factory by name: + +```yaml +slurm_config: + _factory_: "slurm_factory" + nodes: 1 +``` + +Factories are registered at import time via `register_factory()`. In `launch.py`, `slurm_factory` reads from environment variables (`SLURM_HOST`, `SLURM_ACCOUNT`, etc.). In `slurm.py`, `slurm_factory` resolves to a cluster-specific factory based on `SLURM_CLUSTER`: + +```bash +# Default (OCI-HSG) +uv run slurm.py --yaml config.yaml --yes + +# Switch cluster +SLURM_CLUSTER=cw_dfw uv run slurm.py --yaml config.yaml --yes +``` + +### YAML Formats + +**`--yaml` format** (recommended) — maps top-level keys to function args: + +```yaml +job_name: Qwen3-8B_NVFP4 +pipeline: + task_0: + script: services/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**`pipeline=@` format** — bare pipeline without wrapper: + +```yaml +task_0: + script: services/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**Test YAML format** — list of jobs with `_target_` and overrides, used by `tools/run_test_yaml.sh`: + +```yaml +- _target_: Qwen/Qwen3-8B/megatron_lm_ptq.yaml + pipeline: + allow_to_fail: true + skip: false + note: "known flaky" +``` + +Overrides are flattened to dot-notation and passed as nemo-run CLI args (e.g., `pipeline.allow_to_fail=True`). + +### Global Variables + +Pipeline YAMLs support `<>` interpolation for sharing values across tasks: + +```yaml +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + task_0: + environment: + - HF_MODEL_CKPT: <> + + task_1: + environment: + - HF_MODEL_CKPT: <> +``` + +This is resolved in `SandboxPipeline.__post_init__` using regex substitution, not OmegaConf (which fails on isolated sub-configs in nemo-run). 
+ +### Metadata + +Each experiment writes `metadata.json` to `experiments//<id>/`: + +```json +{ + "experiment_id": "cicd_1773420387", + "job_name": "Qwen3-8B_NVFP4_DEFAULT_CFG", + "allow_to_fail": false, + "note": "" +} +``` + +This is used by: + +- `tools/wait_for_experiments.sh` — skip blocking on `allow_to_fail` failures +- `tools/post_review_to_gitlab.sh` — create/update GitLab issues for allowed failures +- Claude Code's `review-logs` skill — emit `<system-out>` instead of `<failure>` in JUnit XML + +## Using Claude Code with the Launcher + +Claude Code can create a tight feedback loop for model quantization experiments: configure → submit → monitor → diagnose → fix → resubmit — all from the CLI. + +### Setup + +Install Claude Code and ensure the launcher is ready: + +```bash +npm install -g @anthropic-ai/claude-code +cd Model-Optimizer/launcher +git submodule update --init --recursive +``` + +### Workflow: Submit and Monitor + +Ask Claude Code to launch a job and wait for results: + +```text +> Run Qwen3-8B quantization on OCI-HSG and wait for it to finish + +Claude will: +1. Run: uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +2. Monitor with: NEMORUN_HOME=$(pwd) uv run nemo experiment status <id> +3. Fetch logs when done: NEMORUN_HOME=$(pwd) uv run nemo experiment logs <id> 0 +4. Report the MMLU score and pass/fail status +``` + +### Workflow: Diagnose Failures + +When a job fails, ask Claude Code to analyze the logs: + +```text +> /review-logs + +Claude will: +1. Find all experiments in experiments/ +2. Fetch logs via nemo experiment logs +3. Read and analyze error tracebacks +4. Produce a structured report with root cause and suggested fix +5. Write a JUnit XML for CI integration +``` + +### Workflow: Add a New Model + +Ask Claude Code to set up a new model config: + +```text +> Add Llama-3.1-70B quantization config. It needs 2 nodes with 4 GPUs each. + +Claude will: +1. Create Meta/Llama-3.1-70B/megatron_lm_ptq.yaml +2. 
Set appropriate TP/EP based on model size +3. Reference the correct service script +4. Test with --dryrun to verify the config +``` + +### Workflow: Iterate on Failures + +Claude Code can fix issues and resubmit in a loop: + +```text +> The job failed with CUDA OOM. Try reducing the sequence length to 4096 and resubmit. + +Claude will: +1. Edit the YAML config +2. Resubmit with uv run launch.py --yaml <config> --yes +3. Monitor and report results +``` + +### Workflow: Reproduce and Compare + +Use `--to-yaml` to capture configs and compare runs: + +```text +> Dump the resolved config for Qwen3-8B, then run it on both OCI-HSG and CW-DFW + +Claude will: +1. Dump: uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml +2. Run on OCI-HSG: SLURM_CLUSTER=oci_hsg uv run slurm.py --yaml resolved.yaml --yes +3. Run on CW-DFW: SLURM_CLUSTER=cw_dfw uv run slurm.py --yaml resolved.yaml --yes +4. Compare MMLU results +``` + +### Skills + +The following Claude Code skills are available in the nmm-sandbox project: + +| Skill | Trigger | Description | +|---|---|---| +| `/review-logs` | After job completion or failure | Analyze experiment logs, diagnose failures, produce JUnit XML | +| `/wait-for-jobs` | After detached submission | Poll experiment status until all jobs finish | +| `/eagle3-new-model` | Adding a new EAGLE3 model | Generate pipeline YAML for a new model | + +### CI Integration + +In CI, Claude Code runs automatically after each test job to: + +1. Fetch and analyze all experiment logs +2. Generate `claude_analysis.md` with structured findings +3. Write `claude_review_rspec.xml` for GitLab test reporting +4. Post failure summaries as MR comments (via `tools/post_review_to_gitlab.sh`) +5. Create/update GitLab issues for `allow_to_fail` jobs that are consistently failing + +If the main script crashes before the review runs, an `after_script` fallback posts the captured job output to the MR so failures are always visible. 
diff --git a/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml b/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml new file mode 100644 index 000000000..19b6cc0d2 --- /dev/null +++ b/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml @@ -0,0 +1,111 @@ +# EAGLE3 offline speculative decoding pipeline for Qwen3-8B. +# +# 4-step pipeline: +# task_0: Data synthesis — query TRT-LLM server to generate prompt samples +# task_1: Dump hidden states — run target model to capture hidden states +# task_2: Offline training — train the EAGLE3 draft head +# task_3: Benchmark — evaluate speculative decoding speedup via VLLM +# +# All tasks share /scratchspace to pass artifacts between steps. +# +# Usage: +# uv run launch.py --yaml Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes +# uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes + +job_name: Qwen3-8B_EAGLE3_offline +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + # Step 1: Data synthesis via TRT-LLM server + # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py. 
+ task_0: + script: common/tensorrt-llm/query.sh + args: + - --model <<global_vars.hf_model>> + - --tp_size 4 + - --ep_size 4 + - --max_num_tokens 32000 + - --port 8000 + - --host 0.0.0.0 + - --trust_remote_code + - -- + - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples + - --save /scratchspace/data + environment: + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 2: Dump hidden states from target model + task_1: + script: common/eagle3/dump_offline_data.sh + args: + - --input-data /scratchspace/data + - --output-dir /scratchspace/offline_hidden_states + - --max-seq-len 8192 + - --tp 4 + - --moe-ep 4 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 3: Train EAGLE3 draft head (offline, single task) + task_2: + script: common/eagle3/offline_training.sh + args: + - --offline-data /scratchspace/offline_hidden_states + - --data_path None + - --mode eagle3 + - --num_epochs 1 + - --lr 3e-4 + - --save_steps 500000 + - --output_dir /scratchspace/eagle3 + - --train_bs 8 + - --training_seq_len 4096 + - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json + - --disable_tqdm True + - --ar_validate_steps 500000 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 4: Benchmark speculative decoding (VLLM backend) + task_3: + script: common/specdec_bench/quick_check.sh + args: + - --draft_model_dir /scratchspace/export + - --draft_length 3 + - --output_length 4096 + - --engine VLLM + - --tp_size 4 + - --ep_size 1 + - --speculative_algorithm EAGLE3 + - 
--mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl + - --concurrency 1 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest diff --git a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml index 83ed7f4f0..ce7f81224 100644 --- a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml +++ b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -5,7 +5,7 @@ pipeline: note: task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh args: - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail - --calib-size 32 diff --git a/launcher/common/eagle3/dump_offline_data.sh b/launcher/common/eagle3/dump_offline_data.sh new file mode 100644 index 000000000..a11f7f7ed --- /dev/null +++ b/launcher/common/eagle3/dump_offline_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +trtllm-llmapi-launch python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py \ + --model ${HF_MODEL_CKPT} \ + --dp-rank ${TASK_ID} \ + --dp-world-size ${TASK_COUNT} \ + ${@} diff --git a/launcher/common/eagle3/offline_training.sh b/launcher/common/eagle3/offline_training.sh new file mode 100644 index 000000000..4dfe2de7c --- /dev/null +++ b/launcher/common/eagle3/offline_training.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../service_utils.sh + +pip install -r modules/Model-Optimizer/examples/speculative_decoding/requirements.txt +pip install "huggingface-hub>=1.2.1" +export PATH=$PATH:/workspace/.local/bin + +################################################################################################### + +trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER + +bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \ + --model ${HF_MODEL_CKPT} \ + ${@} + +python modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \ + --model_path /scratchspace/eagle3 \ + --export_path /scratchspace/export + +################################################################################################### + +# This function handles the exit status (fails the CI). +#exit_handler $0 diff --git a/launcher/services/megatron-lm/quantize/quantize.sh b/launcher/common/megatron-lm/quantize/quantize.sh similarity index 100% rename from launcher/services/megatron-lm/quantize/quantize.sh rename to launcher/common/megatron-lm/quantize/quantize.sh diff --git a/launcher/common/query.py b/launcher/common/query.py new file mode 100644 index 000000000..79ec93f54 --- /dev/null +++ b/launcher/common/query.py @@ -0,0 +1,147 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D100,D101,D102,D103,D107,F841,PLR1722 +import argparse +import os + +from datasets import load_dataset +from openai import OpenAI + +early_termination = False + + +class LLM: + def __init__(self, args): + self.args = args + self.client = OpenAI(base_url=args.base_url) + self.generate(messages=[{"role": "user", "content": "Hello! /no_think"}], verbose=True) + + def generate(self, messages, verbose=False, **chat_template_kwargs): + try: + completion = self.client.chat.completions.create( + model=self.args.model, + messages=messages, + temperature=self.args.temperature, + ) + new_message = completion.choices[0].message.content + if verbose: + for msg in messages: + print("[OLD] {:10}: {:64}".format(msg["role"], msg["content"])) + print("[NEW] {:10}: {:64}\n\n".format("assistant", new_message)) + + new_message = {"role": "assistant", "content": new_message} + except Exception as e: + print(e) + + if "Connection error" in str(e): + globals()["early_termination"] = True + + new_message = None + + return new_message + + +parser = argparse.ArgumentParser(prog="query") +parser.add_argument("base_url", type=str, help="url to the OpenAI compatible API.") +parser.add_argument("model", type=str, help="model name") +parser.add_argument( + "--data", type=str, default=None, help="path to OAI chat data (local or HF hub)" +) +parser.add_argument("--data-split", type=str, default="train", help="HF dataset split") +parser.add_argument("--save", type=str, default=None, help="path to store the generated output.") +parser.add_argument("--num-shards", type=int, default=1000, help="number of shards.") +parser.add_argument("--shard-id-begin", type=int, default=0, help="the shard id to start.") +parser.add_argument( + "--shard-id-step", type=int, default=1, help="the step that the shard id progress."
+) +parser.add_argument("--num-proc", type=int, default=32, help="number of processes (concurrency).") +parser.add_argument("--temperature", type=float, default=0.0, help="temperature.") +args = parser.parse_args() + +llm = LLM(args) + +if args.data is None: + exit(0) + + +def disable_thinking_column(data): + data.update({"enable_thinking": False}) + return data + + +def synthesize(data): + messages = data.get("conversations", None) + if messages is None: + messages = data.get("messages", None) + if messages is None: + raise ValueError( + "No conversations or messages in the data. Only OAI chat data is supported." + ) + + # Handle generation specific kwargs. + enable_thinking = data.get("enable_thinking", True) + + current_messages = [] + + for msg in messages: + if msg["role"] == "system": + current_messages.append(msg) + elif msg["role"] == "user": + if not enable_thinking: + msg["content"] = msg["content"] + " /no_think" + + current_messages.append(msg) + new_message = llm.generate(current_messages, verbose=False) + if new_message is None: + break + else: + current_messages.append(new_message) + elif msg["role"] == "assistant": + # Original assistant messages are not used + pass + else: + raise ValueError("unknown role: {}".format(msg["role"])) + + return {"conversations": current_messages} + + +dataset = load_dataset(args.data, split=args.data_split) + +if args.num_shards * 100 > len(dataset): + args.num_shards = min(16, len(dataset) // 100) + +if args.save is not None: + print("Create save dir: {}".format(args.save)) + os.makedirs(args.save, exist_ok=True) + +for shard_id in range(args.shard_id_begin, args.num_shards, args.shard_id_step): + file_path = args.save + "/train-{:05}-{:05}.jsonl".format(shard_id + 1, args.num_shards) + + if os.path.exists(file_path): + continue + + shard = dataset.shard(num_shards=args.num_shards, index=shard_id) + print(len(shard), file_path) + + if shard_id % 2 == 0: + shard = shard.map(disable_thinking_column,
num_proc=args.num_proc) + updated_shard = shard.map(synthesize, num_proc=args.num_proc) + updated_shard.to_json(file_path) + print(updated_shard[0]) + + if early_termination: + print("Terminate earlier due to server connection error!") + break diff --git a/launcher/services/service_utils.sh b/launcher/common/service_utils.sh similarity index 100% rename from launcher/services/service_utils.sh rename to launcher/common/service_utils.sh diff --git a/launcher/common/specdec_bench/quick_check.sh b/launcher/common/specdec_bench/quick_check.sh new file mode 100644 index 000000000..d90413969 --- /dev/null +++ b/launcher/common/specdec_bench/quick_check.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### + + +${TRTLLM_LAUNCH_SCRIPT} python3 modules/Model-Optimizer/examples/specdec_bench/run.py \ + --model_dir ${HF_MODEL_CKPT} \ + --tokenizer ${HF_MODEL_CKPT} \ + ${@} diff --git a/launcher/common/tensorrt-llm/query.sh b/launcher/common/tensorrt-llm/query.sh new file mode 100644 index 000000000..3bc2ec106 --- /dev/null +++ b/launcher/common/tensorrt-llm/query.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### +# Usage: +# query.sh --model MODEL [SERVE_ARGS...] -- [QUERY_ARGS...] +# +# Launches trtllm-serve with the given model, waits for it to be ready, +# then runs common/query.py against the server. +# +# --model MODEL is required and is consumed by this script. It is used as the +# positional model argument for both trtllm-serve and common/query.py. +# +# Remaining arguments are split on "--": +# - Args BEFORE "--" are appended to the trtllm-serve command (SERVE_ARGS). 
+# - Args AFTER "--" are passed to common/query.py (QUERY_ARGS). +# - If "--" is absent, all remaining args go to common/query.py. +# +# Environment variables (optional, set by Slurm): +# SLURM_ARRAY_TASK_ID Used to shard query.py work across array jobs. +# SLURM_ARRAY_TASK_COUNT Total number of array tasks for sharding. +# +# In a pipeline YAML task config: +# args: +# - --model /hf-local/Qwen/Qwen3-8B # required +# - --tp_size 4 # trtllm-serve args (before --) +# - --ep_size 4 +# - --max_num_tokens 32000 +# - --port 8000 +# - --host 0.0.0.0 +# - --trust_remote_code +# - -- # separator +# - --data /hf-local/dataset # query.py args (after --) +# - --save /scratchspace/data +################################################################################################### + +export OPENAI_API_KEY="token-abc123" + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +# Parse --model and split remaining args on "--". +# --model is consumed here; args before "--" go to trtllm-serve, args after go to query.py. +MODEL="" +SERVE_EXTRA_ARGS=() +QUERY_ARGS=(--shard-id-begin ${TASK_ID} --shard-id-step ${TASK_COUNT}) +past_separator=false +skip_next=false + +for arg in "$@"; do + if $skip_next; then + MODEL="$arg" + skip_next=false + elif [ "$arg" = "--model" ]; then + skip_next=true + elif [ "$arg" = "--" ]; then + past_separator=true + elif [ "$past_separator" = false ]; then + SERVE_EXTRA_ARGS+=("$arg") + else + QUERY_ARGS+=("$arg") + fi +done + +trtllm-llmapi-launch trtllm-serve \ + ${MODEL} \ + "${SERVE_EXTRA_ARGS[@]}" \ + & + + +# Wait for server to start up by polling the health endpoint +echo "Waiting for server to start..." 
+while true; do + response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true) + if [ "$response" -eq 200 ]; then + echo "Server is up!" + break + fi + echo "Server not ready yet, retrying in 10 seconds..." + sleep 10 +done + +if [[ "$mpi_rank" -eq 0 ]]; then + cmd="python common/query.py http://localhost:8000/v1 ${MODEL} ${QUERY_ARGS[*]}" + echo "Running command: $cmd" + eval $cmd + echo "Main process exit" +else + while true; do + response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true) + if [[ "$response" -ne 200 ]]; then + break + fi + #echo "Server is up!" + sleep 60 + done +fi + +pkill trtllm-serve + +exit 0 diff --git a/launcher/common/vllm/query.sh b/launcher/common/vllm/query.sh new file mode 100755 index 000000000..d203e8994 --- /dev/null +++ b/launcher/common/vllm/query.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### +# Usage: +# query.sh --model MODEL [SERVE_ARGS...] -- [QUERY_ARGS...] +# +# Launches vllm serve with the given model, waits for it to be ready, +# then runs common/query.py against the server. 
+# +# --model MODEL is required and is consumed by this script. It is used as the +# positional model argument for both vllm serve and common/query.py. +# +# Remaining arguments are split on "--": +# - Args BEFORE "--" are appended to the vllm serve command (SERVE_ARGS). +# - Args AFTER "--" are passed to common/query.py (QUERY_ARGS). +# - If "--" is absent, all remaining args go to common/query.py. +# +# Environment variables (optional, set by Slurm): +# SLURM_ARRAY_TASK_ID Used to shard query.py work across array jobs. +# SLURM_ARRAY_TASK_COUNT Total number of array tasks for sharding. +# +# vLLM notes: +# - vLLM manages GPU distribution internally; run with ntasks_per_node: 1 +# in slurm_config and pass --tensor-parallel-size to match gpus_per_node. +# - NVFP4 models require vllm/vllm-openai:v0.15.0+ on Blackwell GPUs. +# - Use --trust-remote-code for models with custom architectures (e.g. Kimi). +# +# In a pipeline YAML task config: +# args: +# - --model /hf-local/Qwen/Qwen3-8B # required +# - --tensor-parallel-size 4 # vllm serve args (before --) +# - --max-num-seqs 32 +# - --trust-remote-code +# - -- # separator +# - --data /hf-local/dataset # query.py args (after --) +# - --save /scratchspace/data +# slurm_config: +# ntasks_per_node: 1 # vLLM is single-process +# gpus_per_node: 4 +################################################################################################### + +export OPENAI_API_KEY="token-abc123" + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +# Parse --model and split remaining args on "--". +# --model is consumed here; args before "--" go to vllm serve, args after go to query.py. 
+MODEL="" +SERVE_EXTRA_ARGS=() +QUERY_ARGS=(--shard-id-begin ${TASK_ID} --shard-id-step ${TASK_COUNT}) +past_separator=false +skip_next=false + +for arg in "$@"; do + if $skip_next; then + MODEL="$arg" + skip_next=false + elif [ "$arg" = "--model" ]; then + skip_next=true + elif [ "$arg" = "--" ]; then + past_separator=true + elif [ "$past_separator" = false ]; then + SERVE_EXTRA_ARGS+=("$arg") + else + QUERY_ARGS+=("$arg") + fi +done + +# vLLM is single-process: GPU parallelism is handled internally via --tensor-parallel-size. +# No MPI multi-rank logic needed; this script always runs as a single task. +vllm serve \ + ${MODEL} \ + "${SERVE_EXTRA_ARGS[@]}" \ + & +SERVER_PID=$! + + +# Wait for server to start up by polling the health endpoint +echo "Waiting for server to start..." +while true; do + response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true) + if [ "$response" -eq 200 ]; then + echo "Server is up!" + break + fi + echo "Server not ready yet, retrying in 10 seconds..." 
+ sleep 10 +done + +cmd="python common/query.py http://localhost:8000/v1 ${MODEL} ${QUERY_ARGS[*]}" +echo "Running command: $cmd" +eval $cmd +echo "Main process exit" + +kill $SERVER_PID +wait $SERVER_PID 2>/dev/null || true + +exit 0 diff --git a/launcher/launch.py b/launcher/launch.py index 7251effd1..5b90d9acf 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -57,8 +57,9 @@ "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/examples/*", "services/*", + "common/*", ], - relative_path=[LAUNCHER_DIR] * 6, + relative_path=[LAUNCHER_DIR] * 7, ) MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") From 22b5267ef6fff5710c1541277b02c87eace1cb26 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sat, 14 Mar 2026 19:44:28 -0700 Subject: [PATCH 06/12] add: unit tests for launcher (64 tests, all passing) Add tests/unit/launcher/ with 7 test files covering core dataclasses, factory registry, global_vars, env merging, YAML formats, Docker executor mounts, Slurm executor params (mocked), and end-to-end Docker launch via subprocess. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- tests/unit/launcher/__init__.py | 31 ++ tests/unit/launcher/conftest.py | 54 +++ tests/unit/launcher/test_core.py | 243 +++++++++++++ tests/unit/launcher/test_core_extended.py | 352 +++++++++++++++++++ tests/unit/launcher/test_docker_execution.py | 331 +++++++++++++++++ tests/unit/launcher/test_docker_launch.py | 124 +++++++ tests/unit/launcher/test_slurm_config.py | 118 +++++++ tests/unit/launcher/test_slurm_executor.py | 230 ++++++++++++ tests/unit/launcher/test_yaml_formats.py | 193 ++++++++++ 9 files changed, 1676 insertions(+) create mode 100644 tests/unit/launcher/__init__.py create mode 100644 tests/unit/launcher/conftest.py create mode 100644 tests/unit/launcher/test_core.py create mode 100644 tests/unit/launcher/test_core_extended.py create mode 100644 tests/unit/launcher/test_docker_execution.py create mode 100644 tests/unit/launcher/test_docker_launch.py create mode 100644 tests/unit/launcher/test_slurm_config.py create mode 100644 tests/unit/launcher/test_slurm_executor.py create mode 100644 tests/unit/launcher/test_yaml_formats.py diff --git a/tests/unit/launcher/__init__.py b/tests/unit/launcher/__init__.py new file mode 100644 index 000000000..7c9dc907f --- /dev/null +++ b/tests/unit/launcher/__init__.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for the ModelOpt Launcher. + +Coverage: + - test_core.py: Shared dataclasses, factory registry, global_vars interpolation, + version reporting, default env generation, and the run_jobs loop (mocked). + - test_slurm_config.py: SlurmConfig dataclass defaults and slurm_factory behavior + with environment variable overrides. + - test_yaml_formats.py: YAML parsing for --yaml format, pipeline=@ format, and + task_configs resolution via registered factories. + +Not covered (requires live infrastructure): + - Actual Slurm job submission (SSH tunnel, sbatch) + - Docker container launch + - nemo experiment status/logs polling + - PatternPackager tar.gz creation and rsync +""" diff --git a/tests/unit/launcher/conftest.py b/tests/unit/launcher/conftest.py new file mode 100644 index 000000000..d19ced583 --- /dev/null +++ b/tests/unit/launcher/conftest.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Fixtures for launcher unit tests. + +These tests can be run standalone without installing modelopt: + cd Model-Optimizer/launcher + uv pip install pytest + uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --rootdir=. 
+""" + +import os +import sys + +import pytest + +# Prevent pytest from loading the root conftest.py (which imports torch/modelopt) +collect_ignore_glob = ["../../conftest.py"] + + +@pytest.fixture(autouse=True) +def add_launcher_to_path(): + """Add the launcher directory to sys.path so core.py and slurm_config.py can be imported.""" + launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") + launcher_dir = os.path.abspath(launcher_dir) + if launcher_dir not in sys.path: + sys.path.insert(0, launcher_dir) + yield + if launcher_dir in sys.path: + sys.path.remove(launcher_dir) + + +@pytest.fixture +def tmp_yaml(tmp_path): + """Helper to write a YAML file and return its path.""" + + def _write(content, name="test.yaml"): + p = tmp_path / name + p.write_text(content) + return str(p) + + return _write diff --git a/tests/unit/launcher/test_core.py b/tests/unit/launcher/test_core.py new file mode 100644 index 000000000..69c0fc40d --- /dev/null +++ b/tests/unit/launcher/test_core.py @@ -0,0 +1,243 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for launcher/core.py — shared dataclasses, factory registry, and utilities. 
+ +Coverage: + - SandboxTask: dataclass fields and defaults, skip flag + - SandboxPipeline: task slot collection, task_configs resolution, global_vars interpolation + - Factory registry: register_factory, lookup in create_task_from_yaml + - set_slurm_config_type: patches SandboxTask annotation + - get_default_env: returns correct env dicts for a given experiment title + - report_versions: runs without error on a git repo +""" + +import os + + +class TestSandboxTask: + """Tests for the SandboxTask dataclass.""" + + def test_defaults(self): + from core import SandboxTask + + task = SandboxTask() + assert task.script is None + assert task.slurm_config is None + assert task.args is None + assert task.environment is None + assert task.skip is False + + def test_with_values(self): + from core import SandboxTask + + task = SandboxTask( + script="test.sh", + args=["--foo", "bar"], + environment=[{"KEY": "val"}], + skip=True, + ) + assert task.script == "test.sh" + assert task.args == ["--foo", "bar"] + assert task.environment == [{"KEY": "val"}] + assert task.skip is True + + +class TestSandboxPipeline: + """Tests for SandboxPipeline task collection and global_vars interpolation.""" + + def test_task_slots_collected(self): + from core import SandboxPipeline, SandboxTask0, SandboxTask1 + + t0 = SandboxTask0(script="a.sh") + t1 = SandboxTask1(script="b.sh") + pipeline = SandboxPipeline(task_0=t0, task_1=t1) + assert len(pipeline.tasks) == 2 + assert pipeline.tasks[0].script == "a.sh" + assert pipeline.tasks[1].script == "b.sh" + + def test_empty_pipeline(self): + from core import SandboxPipeline + + pipeline = SandboxPipeline() + assert pipeline.tasks == [] + + def test_global_vars_interpolation_in_environment(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + environment=[{"MODEL": "<<global_vars.hf_model>>"}], + ) + pipeline = SandboxPipeline( + task_0=t0, + 
global_vars=GlobalVariables(hf_model="/hf-local/Qwen/Qwen3-8B"), + ) + assert pipeline.tasks[0].environment == [{"MODEL": "/hf-local/Qwen/Qwen3-8B"}] + + def test_global_vars_interpolation_in_args(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["--model", "<<global_vars.hf_model>>"], + ) + pipeline = SandboxPipeline( + task_0=t0, + global_vars=GlobalVariables(hf_model="/models/llama"), + ) + assert pipeline.tasks[0].args == ["--model", "/models/llama"] + + def test_global_vars_unresolved_passthrough(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["<<global_vars.nonexistent>>"], + ) + pipeline = SandboxPipeline( + task_0=t0, + global_vars=GlobalVariables(hf_model="/models/llama"), + ) + # Unresolved references are left as-is + assert pipeline.tasks[0].args == ["<<global_vars.nonexistent>>"] + + def test_skip_and_allow_to_fail(self): + from core import SandboxPipeline + + pipeline = SandboxPipeline(skip=True, allow_to_fail=True, note="test note") + assert pipeline.skip is True + assert pipeline.allow_to_fail is True + assert pipeline.note == "test note" + + +class TestFactoryRegistry: + """Tests for register_factory and its use in create_task_from_yaml.""" + + def test_register_and_lookup(self, tmp_yaml): + from core import _FACTORY_REGISTRY, register_factory + + # Register a mock factory + def mock_factory(nodes=1, **kwargs): + return {"nodes": nodes, "factory": "mock"} + + register_factory("mock_factory", mock_factory) + assert "mock_factory" in _FACTORY_REGISTRY + assert _FACTORY_REGISTRY["mock_factory"] is mock_factory + + def test_create_task_from_yaml_uses_registry(self, tmp_yaml): + from core import create_task_from_yaml, register_factory + + def test_factory(nodes=1): + return {"nodes": nodes} + + register_factory("test_factory", test_factory) + + yaml_content = """ +script: test.sh +args: + - --flag 
+slurm_config: + _factory_: "test_factory" + nodes: 2 +""" + path = tmp_yaml(yaml_content) + task = create_task_from_yaml(path, factory_lookup={"test_factory": test_factory}) + assert task.script == "test.sh" + assert task.args == ["--flag"] + assert task.slurm_config == {"nodes": 2} + + def test_task_configs_resolved_via_registry(self, tmp_yaml): + from core import SandboxPipeline, register_factory + + def dummy_factory(nodes=1): + return {"nodes": nodes} + + register_factory("dummy_factory", dummy_factory) + + task_yaml = tmp_yaml( + """ +script: hello.sh +slurm_config: + _factory_: "dummy_factory" + nodes: 3 +""", + name="task.yaml", + ) + pipeline = SandboxPipeline(task_configs=[task_yaml]) + assert len(pipeline.tasks) == 1 + assert pipeline.tasks[0].script == "hello.sh" + assert pipeline.tasks[0].slurm_config == {"nodes": 3} + + +class TestSetSlurmConfigType: + """Tests for set_slurm_config_type annotation patching.""" + + def test_patches_annotation(self): + from dataclasses import dataclass + + from core import SandboxTask, set_slurm_config_type + + @dataclass + class MockSlurmConfig: + host: str = "test" + + set_slurm_config_type(MockSlurmConfig) + assert SandboxTask.__annotations__["slurm_config"] is MockSlurmConfig + assert SandboxTask.__dataclass_fields__["slurm_config"].type is MockSlurmConfig + + +class TestGetDefaultEnv: + """Tests for get_default_env utility.""" + + def test_default_title(self): + from core import get_default_env + + slurm_env, local_env = get_default_env() + assert slurm_env["TRITON_CACHE_DIR"] == "/cicd/triton-cache" + assert slurm_env["HF_HOME"] == "/cicd/hf-cache" + assert slurm_env["MLM_SKIP_INSTALL"] == "1" + assert "LAUNCH_SCRIPT" in slurm_env + assert local_env["TRITON_CACHE_DIR"] == "/cicd/triton-cache" + assert "LAUNCH_SCRIPT" not in local_env + + def test_custom_title(self): + from core import get_default_env + + slurm_env, local_env = get_default_env("modelopt") + assert slurm_env["TRITON_CACHE_DIR"] == 
"/modelopt/triton-cache" + assert slurm_env["HF_HOME"] == "/modelopt/hf-cache" + assert local_env["HF_HOME"] == "/modelopt/hf-cache" + + +class TestReportVersions: + """Tests for report_versions git info utility.""" + + def test_runs_on_repo(self, capsys): + from core import report_versions + + # Should not raise — runs git on the current repo + report_versions(os.getcwd()) + captured = capsys.readouterr() + assert "Version Report" in captured.out + + def test_runs_on_nonexistent_dir(self, capsys): + from core import report_versions + + # Should handle gracefully — "unknown" for non-git dirs + report_versions("/tmp/nonexistent_dir_12345") + captured = capsys.readouterr() + assert "Version Report" in captured.out + assert "unknown" in captured.out diff --git a/tests/unit/launcher/test_core_extended.py b/tests/unit/launcher/test_core_extended.py new file mode 100644 index 000000000..698c5b438 --- /dev/null +++ b/tests/unit/launcher/test_core_extended.py @@ -0,0 +1,352 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Extended tests for launcher/core.py — edge cases and remaining coverage gaps. 
+ +Coverage: + - create_task_from_yaml: error cases (missing factory, bad YAML) + - SandboxPipeline: dict environment (not list), task_configs with registry fallback + - _git_info: direct tests for success and failure + - run_jobs: environment merging (list vs dict), test_level filtering, pipeline skip, + detach flag, version report +""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + + +class TestCreateTaskFromYamlErrors: + """Error handling in create_task_from_yaml.""" + + def test_missing_factory_raises(self, tmp_yaml): + from core import create_task_from_yaml + + yaml_content = """ +script: test.sh +slurm_config: + _factory_: "nonexistent_factory" + nodes: 1 +""" + path = tmp_yaml(yaml_content) + with pytest.raises(KeyError): + create_task_from_yaml(path, factory_lookup={}) + + def test_missing_slurm_config_raises(self, tmp_yaml): + from core import create_task_from_yaml + + yaml_content = """ +script: test.sh +""" + path = tmp_yaml(yaml_content) + with pytest.raises((KeyError, TypeError)): + create_task_from_yaml(path, factory_lookup={}) + + def test_environment_preserved(self, tmp_yaml): + from core import create_task_from_yaml + + def factory(nodes=1): + return {"nodes": nodes} + + yaml_content = """ +script: test.sh +environment: + - KEY1: val1 + - KEY2: val2 +slurm_config: + _factory_: "f" + nodes: 1 +""" + path = tmp_yaml(yaml_content) + task = create_task_from_yaml(path, factory_lookup={"f": factory}) + assert task.environment == [{"KEY1": "val1"}, {"KEY2": "val2"}] + + +class TestSandboxPipelineExtended: + """Extended SandboxPipeline tests.""" + + def test_dict_environment_interpolation(self): + """Global vars resolve in dict-format environment (not list).""" + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + environment={"MODEL": "<<global_vars.hf_model>>", "STATIC": "value"}, + ) + pipeline = SandboxPipeline( + task_0=t0, + 
global_vars=GlobalVariables(hf_model="/hf-local/model"), + ) + assert pipeline.tasks[0].environment == { + "MODEL": "/hf-local/model", + "STATIC": "value", + } + + def test_tasks_list_directly(self): + """Pipeline can receive tasks as a list directly.""" + from core import SandboxPipeline, SandboxTask + + tasks = [ + SandboxTask(script="a.sh"), + SandboxTask(script="b.sh"), + SandboxTask(script="c.sh"), + ] + pipeline = SandboxPipeline(tasks=tasks) + assert len(pipeline.tasks) == 3 + assert pipeline.tasks[2].script == "c.sh" + + def test_no_global_vars_no_error(self): + """Pipeline without global_vars doesn't crash on interpolation.""" + from core import SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["<<global_vars.hf_model>>"], + ) + pipeline = SandboxPipeline(task_0=t0) + # No interpolation happens — args kept as-is + assert pipeline.tasks[0].args == ["<<global_vars.hf_model>>"] + + +class TestGitInfo: + """Direct tests for _git_info helper.""" + + def test_valid_git_repo(self): + from core import _git_info + + commit, branch = _git_info(os.getcwd()) + assert commit != "unknown" + assert branch != "unknown" + assert len(commit) >= 7 # short hash + + def test_nonexistent_directory(self): + from core import _git_info + + commit, branch = _git_info("/tmp/nonexistent_xyz_12345") + assert commit == "unknown" + assert branch == "unknown" + + def test_non_git_directory(self): + from core import _git_info + + # Use /tmp which is outside any git repo + commit, branch = _git_info("/tmp") + # /tmp may or may not be inside a git worktree depending on the system + # Just verify it returns strings without crashing + assert isinstance(commit, str) + assert isinstance(branch, str) + + +class TestRunJobsExtended: + """Extended run_jobs tests for env merging, test_level, and detach.""" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_environment_list_merged_to_env(self, mock_docker, mock_exp, tmp_path): + 
"""List-of-dicts environment is merged into task_env.""" + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_env" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0( + script="test.sh", + slurm_config=MagicMock(), + environment=[{"A": "1"}, {"B": "2"}], + ) + pipeline = SandboxPipeline(task_0=t0) + + with patch("core.run.Script") as mock_script: + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + # Script called with merged env + call_kwargs = mock_script.call_args[1] + assert "A" in call_kwargs["env"] + assert "B" in call_kwargs["env"] + assert call_kwargs["env"]["A"] == "1" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_none_env_values_converted_to_empty_string(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_none" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0( + script="test.sh", + slurm_config=MagicMock(), + environment=[{"KEY": None}], + ) + pipeline = SandboxPipeline(task_0=t0) + + with patch("core.run.Script") as mock_script: + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + 
default_local_env=local_env, + base_dir=str(tmp_path), + ) + env = mock_script.call_args[1]["env"] + assert env["KEY"] == "" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_test_level_filters_pipeline(self, mock_docker, mock_exp, tmp_path): + """Pipelines with test_level > current are skipped.""" + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_lvl" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, test_level=2) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + test_level=0, # lower than pipeline's test_level=2 + base_dir=str(tmp_path), + ) + + # Experiment should not be created for skipped pipelines + mock_exp.assert_not_called() + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_skipped_pipeline_not_run(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, skip=True) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + + mock_exp.assert_not_called() + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def 
test_detach_flag_passed_to_experiment(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_detach" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + detach=True, + base_dir=str(tmp_path), + ) + + mock_exp_inst.run.assert_called_once_with(detach=True) + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_version_report_called(self, mock_docker, mock_exp, tmp_path, capsys): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_ver" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + + captured = capsys.readouterr() + assert "Version Report" in captured.out diff --git a/tests/unit/launcher/test_docker_execution.py b/tests/unit/launcher/test_docker_execution.py new file mode 100644 index 
000000000..693071bb3 --- /dev/null +++ b/tests/unit/launcher/test_docker_execution.py @@ -0,0 +1,331 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Docker execution path — verifies build_docker_executor and run_jobs with mocked Docker. + +Coverage: + - build_docker_executor: container mounts, scratch dir creation, modelopt mount + - run_jobs with hf_local: Docker path selected, env vars applied, metadata written + - --yaml format end-to-end: YAML parsed, pipeline constructed, executor built +""" + +import json +import os +from unittest.mock import MagicMock, patch + + +class TestBuildDockerExecutor: + """Tests for build_docker_executor mount and directory setup.""" + + def test_scratch_dir_created(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + build_docker_executor( + hf_local="/tmp/hf-local", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + scratch_dir = os.path.join(job_dir, "cicd", "exp_123", "task_0") + assert os.path.isdir(scratch_dir) + + def test_hf_local_mount(self, 
tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/my/hf-local", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/my/hf-local:/hf-local" in v for v in volumes) + + def test_scratchspace_mount(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_456", + job_dir=job_dir, + task_name="job_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + expected_scratch = os.path.join(job_dir, "cicd", "exp_456", "job_0") + assert any(f"{expected_scratch}:/scratchspace" in v for v in volumes) + + def test_modelopt_mount(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_789", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/custom/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/custom/modelopt:/opt/modelopt" in v for v in volumes) + + def test_experiment_title_mount(self, tmp_path): + from core import build_docker_executor + + job_dir 
= str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="modelopt", + ) + volumes = executor.volumes + exp_title_path = os.path.join(job_dir, "modelopt") + assert any(f"{exp_title_path}:/modelopt" in v for v in volumes) + + def test_local_slurm_config_mounts_preserved(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=True, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=["/data:/data", "/models:/models"], + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/data:/data" in v for v in volumes) + assert any("/models:/models" in v for v in volumes) + + +class TestRunJobsDockerPath: + """Tests for run_jobs selecting Docker path when hf_local is set.""" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_docker_executor_called_with_hf_local(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_001" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0( + 
script="echo hello", + slurm_config=MagicMock(), + ) + pipeline = SandboxPipeline(task_0=t0) + job_table = {"test_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf-local", + user="testuser", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + mock_docker.assert_called_once() + call_kwargs = mock_docker.call_args + assert call_kwargs[0][0] == "/tmp/hf-local" # hf_local + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_metadata_written(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_meta" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, allow_to_fail=True, note="test note") + job_table = {"meta_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf", + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + metadata_path = os.path.join("experiments", "cicd", "test_exp_meta", "metadata.json") + assert os.path.exists(metadata_path) + with open(metadata_path) as f: + meta = json.load(f) + assert meta["experiment_id"] == "test_exp_meta" + assert meta["job_name"] == "meta_job" + assert meta["allow_to_fail"] is True + assert meta["note"] == "test note" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def 
test_skipped_task_not_submitted(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, SandboxTask1, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_skip" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="run.sh", slurm_config=MagicMock(), skip=True) + t1 = SandboxTask1(script="eval.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, task_1=t1) + job_table = {"skip_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf", + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + # Only task_1 should be submitted (task_0 is skipped) + assert mock_docker.call_count == 1 + + @patch("core.run.Experiment") + @patch("core.build_slurm_executor") + def test_slurm_executor_called_without_hf_local(self, mock_slurm, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_slurm" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_slurm.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="train.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + job_table = {"slurm_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local=None, # No hf_local → Slurm path + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + 
default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + mock_slurm.assert_called_once() diff --git a/tests/unit/launcher/test_docker_launch.py b/tests/unit/launcher/test_docker_launch.py new file mode 100644 index 000000000..8baad32c8 --- /dev/null +++ b/tests/unit/launcher/test_docker_launch.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration test for Docker container launch via run_jobs. + +Requires Docker to be installed and running. Uses python:3.12-slim +(lightweight, no GPU needed) to run a trivial script. 
+ +Run with: pytest -s (stdin capture must be disabled for invoke/fabric) +""" + +import os +import shutil +import subprocess + +import pytest + +docker_available = shutil.which("docker") is not None + + +@pytest.mark.skipif(not docker_available, reason="Docker not available") +class TestDockerLaunch: + """End-to-end Docker launch test using subprocess to avoid pytest stdin capture issues.""" + + def test_echo_script_via_launch(self, tmp_path): + """Launch a Docker container via launch.py subprocess that runs 'echo hello'.""" + # Create a trivial script + script_dir = tmp_path / "scripts" + script_dir.mkdir() + script = script_dir / "hello.sh" + script.write_text("#!/bin/bash\necho 'HELLO_FROM_DOCKER'\n") + script.chmod(0o755) + + # Create a YAML config + yaml_content = """ +job_name: test_hello +pipeline: + task_0: + script: scripts/hello.sh + slurm_config: + _factory_: "slurm_factory" + container: python:3.12-slim +""" + yaml_path = tmp_path / "test.yaml" + yaml_path.write_text(yaml_content) + + # Run launch.py as a subprocess (avoids pytest stdin capture issues) + launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") + launcher_dir = os.path.abspath(launcher_dir) + + result = subprocess.run( + [ + "uv", + "run", + "launch.py", + "--yaml", + str(yaml_path), + f"hf_local={tmp_path}", + "--yes", + ], + cwd=launcher_dir, + capture_output=True, + text=True, + timeout=300, + ) + + # Check output + assert "Version Report" in result.stdout + assert "Launching" in result.stdout or "Entering Experiment" in result.stdout + + def test_failing_script_via_launch(self, tmp_path): + """Launch a Docker container that exits 1 — launch.py should not crash.""" + script_dir = tmp_path / "scripts" + script_dir.mkdir() + script = script_dir / "fail.sh" + script.write_text("#!/bin/bash\necho 'FAILING'\nexit 1\n") + script.chmod(0o755) + + yaml_content = """ +job_name: test_fail +pipeline: + task_0: + script: scripts/fail.sh + slurm_config: + _factory_: 
"slurm_factory" + container: python:3.12-slim +""" + yaml_path = tmp_path / "fail_test.yaml" + yaml_path.write_text(yaml_content) + + launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") + launcher_dir = os.path.abspath(launcher_dir) + + result = subprocess.run( + [ + "uv", + "run", + "launch.py", + "--yaml", + str(yaml_path), + f"hf_local={tmp_path}", + "--yes", + ], + cwd=launcher_dir, + capture_output=True, + text=True, + timeout=300, + ) + + # launch.py should complete (exit 0) even if the job fails + # The job failure is reported in stdout + assert "Version Report" in result.stdout diff --git a/tests/unit/launcher/test_slurm_config.py b/tests/unit/launcher/test_slurm_config.py new file mode 100644 index 000000000..aeb09200e --- /dev/null +++ b/tests/unit/launcher/test_slurm_config.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for launcher/slurm_config.py — SlurmConfig dataclass and factory. 
+ +Coverage: + - SlurmConfig: default values, field types + - slurm_factory: default behavior, env var overrides (SLURM_HOST, SLURM_ACCOUNT, + SLURM_HF_LOCAL), return type +""" + + +class TestSlurmConfig: + """Tests for the SlurmConfig dataclass.""" + + def test_defaults(self): + from slurm_config import SlurmConfig + + cfg = SlurmConfig() + assert cfg.host is None + assert cfg.port == 22 + assert cfg.account is None + assert cfg.partition == "batch" + assert cfg.container is None + assert cfg.nodes == 1 + assert cfg.ntasks_per_node == 1 + assert cfg.gpus_per_node == 1 + assert cfg.local is False + assert cfg.container_mounts is None + assert cfg.srun_args is None + assert cfg.array is None + + def test_custom_values(self): + from slurm_config import SlurmConfig + + cfg = SlurmConfig( + host="login.example.com", + account="my_account", + nodes=4, + gpus_per_node=8, + container="nvcr.io/nvidia/pytorch:24.01-py3", + container_mounts=["/data:/data"], + srun_args=["--no-container-mount-home"], + ) + assert cfg.host == "login.example.com" + assert cfg.account == "my_account" + assert cfg.nodes == 4 + assert cfg.gpus_per_node == 8 + assert cfg.container_mounts == ["/data:/data"] + + +class TestSlurmFactory: + """Tests for the slurm_factory function.""" + + def test_default_returns_slurm_config(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + # slurm_factory with @run.autoconvert returns a nemo-run Config wrapper + assert "SlurmConfig" in repr(cfg) + + def test_default_container(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + assert "tensorrt-llm" in cfg.container + + def test_default_srun_args(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + assert cfg.srun_args == ["--no-container-mount-home"] + + def test_default_container_mounts_from_env(self, monkeypatch): + monkeypatch.setenv("SLURM_HF_LOCAL", "/custom/hf-local") + # Need to re-import to pick up the env var in the default + # The 
factory reads SLURM_HF_LOCAL at call time via the default arg + import importlib + + import slurm_config + + importlib.reload(slurm_config) + cfg = slurm_config.slurm_factory() + assert any("/custom/hf-local:/hf-local" in m for m in cfg.container_mounts) + + def test_override_nodes(self): + from slurm_config import slurm_factory + + cfg = slurm_factory(nodes=8) + assert cfg.nodes == 8 + + def test_override_partition(self): + from slurm_config import slurm_factory + + cfg = slurm_factory(partition="gpu") + assert cfg.partition == "gpu" + + def test_env_var_host(self, monkeypatch): + monkeypatch.setenv("SLURM_HOST", "test-host.example.com") + import importlib + + import slurm_config + + importlib.reload(slurm_config) + cfg = slurm_config.slurm_factory() + assert cfg.host == "test-host.example.com" diff --git a/tests/unit/launcher/test_slurm_executor.py b/tests/unit/launcher/test_slurm_executor.py new file mode 100644 index 000000000..48004c786 --- /dev/null +++ b/tests/unit/launcher/test_slurm_executor.py @@ -0,0 +1,230 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for build_slurm_executor — container mounts, scratch paths, executor params. + +Note: actual SSH tunnel and sbatch submission are not tested (require live infra). +We mock run.SSHTunnel and run.SlurmExecutor to verify the arguments passed. 
+""" + +from unittest.mock import MagicMock, patch + + +class TestBuildSlurmExecutor: + """Tests for build_slurm_executor mount construction and executor params.""" + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_scratch_and_modelopt_mounts(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="test-host", + port=22, + account="test_account", + partition="batch", + container="nvcr.io/test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=["/hf-local:/hf-local"], + srun_args=["--no-container-mount-home"], + nodes=1, + ntasks_per_node=4, + gpus_per_node=4, + array=None, + ) + + build_slurm_executor( + user="testuser", + identity=None, + slurm_config=slurm_config, + experiment_id="exp_001", + job_dir="/lustre/experiments", + task_name="job_0", + packager=MagicMock(), + experiment_title="cicd", + ) + + # Check SlurmExecutor was called + mock_executor.assert_called_once() + call_kwargs = mock_executor.call_args[1] + + # Verify container mounts include scratch, modelopt, and experiment title + mounts = call_kwargs["container_mounts"] + assert any("/scratchspace" in m for m in mounts) + assert any("/opt/modelopt" in m for m in mounts) + assert any("/cicd" in m for m in mounts) + # Original mount preserved + assert any("/hf-local:/hf-local" in m for m in mounts) + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_scratch_path_uses_experiment_title(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="host", + port=22, + account="acct", + partition="batch", + container="img", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=[], + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, 
+ experiment_id="exp_xyz", + job_dir="/data", + task_name="task_0", + packager=MagicMock(), + experiment_title="modelopt", + ) + + mounts = mock_executor.call_args[1]["container_mounts"] + assert any("/data/modelopt/exp_xyz:/scratchspace" in m for m in mounts) + assert any("/data/modelopt:/modelopt" in m for m in mounts) + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_tunnel_created_with_correct_params(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="login.cluster.com", + port=30022, + account="acct", + partition="batch", + container="img", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=[], + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="myuser", + identity="/home/.ssh/id_rsa", + slurm_config=slurm_config, + experiment_id="exp_1", + job_dir="/job", + task_name="t0", + packager=MagicMock(), + ) + + mock_tunnel.assert_called_once() + tunnel_kwargs = mock_tunnel.call_args[1] + assert tunnel_kwargs["host"] == "login.cluster.com" + assert tunnel_kwargs["user"] == "myuser" + assert tunnel_kwargs["port"] == 30022 + assert tunnel_kwargs["identity"] == "/home/.ssh/id_rsa" + assert tunnel_kwargs["job_dir"] == "/job" + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_executor_params(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="h", + port=22, + account="my_acct", + partition="gpu", + container="nvcr.io/img:v1", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=["--mpi=pmix"], + nodes=2, + ntasks_per_node=8, + gpus_per_node=8, + array="0-3", + ) + + packager = MagicMock() + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, + experiment_id="e1", + job_dir="/j", + task_name="t0", + 
packager=packager, + ) + + kw = mock_executor.call_args[1] + assert kw["account"] == "my_acct" + assert kw["partition"] == "gpu" + assert kw["nodes"] == 2 + assert kw["ntasks_per_node"] == 8 + assert kw["gpus_per_node"] == 8 + assert kw["container_image"] == "nvcr.io/img:v1" + assert kw["srun_args"] == ["--mpi=pmix"] + assert kw["array"] == "0-3" + assert kw["packager"] is packager + assert kw["time"] == "04:00:00" + assert kw["retries"] == 0 + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_none_container_mounts_handled(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="h", + port=22, + account="a", + partition="b", + container="c", + modelopt_install_path="/m", + container_mounts=None, + srun_args=None, + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, + experiment_id="e", + job_dir="/j", + task_name="t", + packager=MagicMock(), + ) + + # Should not crash; mounts should still include scratch + modelopt + title + mounts = mock_executor.call_args[1]["container_mounts"] + assert len(mounts) >= 3 diff --git a/tests/unit/launcher/test_yaml_formats.py b/tests/unit/launcher/test_yaml_formats.py new file mode 100644 index 000000000..571535343 --- /dev/null +++ b/tests/unit/launcher/test_yaml_formats.py @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for YAML config parsing — verifies that different YAML formats produce correct dataclasses. + +Coverage: + - --yaml format: top-level job_name + pipeline with task_0, environment, slurm_config + - pipeline=@ format: bare SandboxPipeline without job_name wrapper + - task_configs: list of YAML paths resolved via factory registry + - Environment formats: list-of-dicts and flat dict both parsed correctly + - Global vars: <<global_vars.X>> resolved in both args and environment +""" + +import yaml + + +class TestYamlFormatParsing: + """Tests that YAML content parses into correct dataclass structures.""" + + def test_yaml_format_with_job_name(self, tmp_yaml): + """The --yaml format has job_name and pipeline as top-level keys.""" + content = """ +job_name: test_job +pipeline: + skip: false + allow_to_fail: true + note: "test note" + task_0: + script: test.sh + args: + - --flag + environment: + - KEY: value +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + assert data["job_name"] == "test_job" + assert data["pipeline"]["skip"] is False + assert data["pipeline"]["allow_to_fail"] is True + assert data["pipeline"]["note"] == "test note" + assert data["pipeline"]["task_0"]["script"] == "test.sh" + assert data["pipeline"]["task_0"]["args"] == ["--flag"] + assert data["pipeline"]["task_0"]["environment"] == [{"KEY": "value"}] + + def test_bare_pipeline_format(self, tmp_yaml): + """The pipeline=@ format is a bare SandboxPipeline without wrapper.""" + + content = """ +task_0: + script: a.sh + args: + - --foo +task_1: + 
script: b.sh +allow_to_fail: false +skip: false +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + # Verify the YAML parses into valid SandboxPipeline kwargs + # (nemo-run does this via its CLI parser; we just verify the structure) + assert "task_0" in data + assert "task_1" in data + assert data["task_0"]["script"] == "a.sh" + assert data["task_1"]["script"] == "b.sh" + + def test_task_configs_format(self, tmp_yaml): + """task_configs lists YAML files that are resolved into tasks.""" + from core import SandboxPipeline, register_factory + + def local_factory(nodes=1): + return {"nodes": nodes} + + register_factory("local_factory", local_factory) + + task_path = tmp_yaml( + """ +script: worker.sh +args: + - --batch-size 32 +slurm_config: + _factory_: "local_factory" + nodes: 2 +""", + name="worker.yaml", + ) + + pipeline = SandboxPipeline(task_configs=[task_path]) + assert len(pipeline.tasks) == 1 + assert pipeline.tasks[0].script == "worker.sh" + assert pipeline.tasks[0].args == ["--batch-size 32"] + assert pipeline.tasks[0].slurm_config == {"nodes": 2} + + def test_environment_list_of_dicts(self): + """Environment as list-of-single-key-dicts (nemo-run format).""" + from core import SandboxTask + + task = SandboxTask( + script="test.sh", + environment=[{"A": "1"}, {"B": "2"}, {"C": "3"}], + ) + assert len(task.environment) == 3 + assert task.environment[0] == {"A": "1"} + + def test_global_vars_across_multiple_tasks(self, tmp_yaml): + """Global vars resolve in both task_0 and task_1.""" + from core import GlobalVariables, SandboxPipeline, SandboxTask0, SandboxTask1 + + t0 = SandboxTask0( + script="quantize.sh", + args=["--model", "<<global_vars.hf_model>>"], + environment=[{"HF_MODEL": "<<global_vars.hf_model>>"}], + ) + t1 = SandboxTask1( + script="eval.sh", + environment=[{"HF_MODEL": "<<global_vars.hf_model>>"}], + ) + pipeline = SandboxPipeline( + task_0=t0, + task_1=t1, + 
global_vars=GlobalVariables(hf_model="/hf-local/Qwen/Qwen3-8B"), + ) + assert pipeline.tasks[0].args == ["--model", "/hf-local/Qwen/Qwen3-8B"] + assert pipeline.tasks[0].environment == [{"HF_MODEL": "/hf-local/Qwen/Qwen3-8B"}] + assert pipeline.tasks[1].environment == [{"HF_MODEL": "/hf-local/Qwen/Qwen3-8B"}] + + +class TestTestYamlFormat: + """Tests for the test YAML format used by run_test_yaml.sh.""" + + def test_target_with_overrides(self, tmp_yaml): + """Test YAML entries have _target_ and override fields.""" + content = """ +- _target_: path/to/config.yaml + pipeline: + allow_to_fail: true + skip: false + note: "known issue" +- _target_: path/to/other.yaml + pipeline: + allow_to_fail: false +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + assert isinstance(data, list) + assert len(data) == 2 + assert data[0]["_target_"] == "path/to/config.yaml" + assert data[0]["pipeline"]["allow_to_fail"] is True + assert data[0]["pipeline"]["note"] == "known issue" + assert data[1]["_target_"] == "path/to/other.yaml" + assert data[1]["pipeline"]["allow_to_fail"] is False + + def test_flatten_overrides(self): + """Nested overrides flatten to dot-notation for CLI args.""" + entry = { + "pipeline": { + "allow_to_fail": True, + "skip": False, + } + } + + # Simulate the flatten logic from run_test_yaml.sh + overrides = [] + + def flatten(d, prefix=""): + for k, v in d.items(): + key = f"{prefix}{k}" if prefix else k + if isinstance(v, dict): + flatten(v, f"{key}.") + else: + overrides.append(f"{key}={v}") + + flatten(entry) + assert "pipeline.allow_to_fail=True" in overrides + assert "pipeline.skip=False" in overrides From 59cdedea845a3af399f5e37a368fc5a0f0c77907 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sat, 14 Mar 2026 19:50:13 -0700 Subject: [PATCH 07/12] fix: replace Model-Optimizer submodule with symlink to parent Remove self-referential launcher/modules/Model-Optimizer submodule (flagged in PR review as 
creating recursive nesting). Replace with a symlink to ../.. (the Model-Optimizer root). The packager's find follows symlinks so modelopt/* and examples/* are packaged correctly. Verified: Docker launch with symlink works (quantize step finds modelopt). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- .gitmodules | 3 --- launcher/modules/Model-Optimizer | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) mode change 160000 => 120000 launcher/modules/Model-Optimizer diff --git a/.gitmodules b/.gitmodules index 23a5af209..87630967d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "launcher/modules/Megatron-LM"] path = launcher/modules/Megatron-LM url = https://github.com/AAnoosheh/Megatron-LM.git -[submodule "launcher/modules/Model-Optimizer"] - path = launcher/modules/Model-Optimizer - url = https://github.com/NVIDIA/Model-Optimizer.git diff --git a/launcher/modules/Model-Optimizer b/launcher/modules/Model-Optimizer deleted file mode 160000 index 69c0d4794..000000000 --- a/launcher/modules/Model-Optimizer +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 69c0d47946086d032e665ecf59a9ff28dc32f5b8 diff --git a/launcher/modules/Model-Optimizer b/launcher/modules/Model-Optimizer new file mode 120000 index 000000000..c25bddb6d --- /dev/null +++ b/launcher/modules/Model-Optimizer @@ -0,0 +1 @@ +../.. \ No newline at end of file From bf91e2ba3ba8c391ceb65748bd443df9f1308892 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sat, 14 Mar 2026 21:13:49 -0700 Subject: [PATCH 08/12] chg: docs, gitignore, hf_local global_vars, symlink auto-creation Add launcher/.gitignore, CLAUDE.md. Update README with hf_local docs, test instructions, verified results. Fix ADVANCED.md stale paths. Add hf_local to GlobalVariables. Use <<global_vars.hf_local>> in YAML. Remove stale services/* from packager. quantize.sh reads MMLU_DATASET env var. 
launch.py auto-creates Model-Optimizer symlink. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- launcher/.gitignore | 22 ++ launcher/ADVANCED.md | 24 +- launcher/CLAUDE.md | 117 ++++++++++ launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml | 20 +- launcher/README.md | 96 ++++++-- .../common/megatron-lm/quantize/quantize.sh | 2 +- launcher/core.py | 1 + launcher/launch.py | 9 +- uv.lock | 217 +++++++++++++++++- 9 files changed, 468 insertions(+), 40 deletions(-) create mode 100644 launcher/.gitignore create mode 100644 launcher/CLAUDE.md diff --git a/launcher/.gitignore b/launcher/.gitignore new file mode 100644 index 000000000..3eb4a4907 --- /dev/null +++ b/launcher/.gitignore @@ -0,0 +1,22 @@ +# Virtual environment +.venv/ + +# nemo-run state +.slurm_jobs +.docker_jobs.json +.local_jobs.json + +# Experiment artifacts (generated at runtime) +experiments/ +local_experiments/ + +# uv lock (generated, not portable) +uv.lock + +# Python cache +__pycache__/ + +# Editor swap files +*.swp +*.swo +*~ diff --git a/launcher/ADVANCED.md b/launcher/ADVANCED.md index fb4bc0256..8698f4ce8 100644 --- a/launcher/ADVANCED.md +++ b/launcher/ADVANCED.md @@ -35,13 +35,25 @@ code/ ├── modules/ │ ├── Megatron-LM/megatron/... # Training framework │ └── Model-Optimizer/modelopt/... # ModelOpt library (mounted over container install) -└── services/ - └── megatron-lm/quantize/ - └── quantize.sh # Job script +└── common/ + ├── megatron-lm/quantize/ + │ └── quantize.sh # PTQ quantization + MMLU + ├── tensorrt-llm/query.sh # TRT-LLM server + query + ├── vllm/query.sh # vLLM server + query + ├── eagle3/ # EAGLE3 pipeline scripts + └── query.py # OpenAI-compatible query client ``` The `modelopt/` directory is bind-mounted over the container's installed ModelOpt, so your local changes take effect without rebuilding the container. 
+### Model-Optimizer Symlink + +`launcher/modules/Model-Optimizer` is a **symlink** to `../..` (the Model-Optimizer root), not a git submodule. This avoids recursive nesting — the launcher lives inside Model-Optimizer and references its own parent. + +- Git tracks the symlink natively (`git clone` preserves it) +- `launch.py` auto-creates the symlink on first run if it's missing +- The packager's `find` follows symlinks, so `modules/Model-Optimizer/modelopt/*` resolves correctly + ### Factory System Slurm cluster configs use a factory pattern. YAMLs reference a factory by name: @@ -70,7 +82,7 @@ SLURM_CLUSTER=cw_dfw uv run slurm.py --yaml config.yaml --yes job_name: Qwen3-8B_NVFP4 pipeline: task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh slurm_config: _factory_: "slurm_factory" ``` @@ -79,12 +91,12 @@ pipeline: ```yaml task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh slurm_config: _factory_: "slurm_factory" ``` -**Test YAML format** — list of jobs with `_target_` and overrides, used by `tools/run_test_yaml.sh`: +**Test YAML format** — list of jobs with `_target_` and overrides, used by nmm-sandbox's `tools/run_test_yaml.sh` for CI: ```yaml - _target_: Qwen/Qwen3-8B/megatron_lm_ptq.yaml diff --git a/launcher/CLAUDE.md b/launcher/CLAUDE.md new file mode 100644 index 000000000..288923272 --- /dev/null +++ b/launcher/CLAUDE.md @@ -0,0 +1,117 @@ +# CLAUDE.md — ModelOpt Launcher + +## Overview + +The launcher submits ModelOpt quantization, training, and evaluation jobs to Slurm clusters or runs them locally with Docker. It shares core logic (`core.py`) with [nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)'s `slurm.py`. 
+ +## Key Files + +| File | Role | +|------|------| +| `launch.py` | Public entrypoint — accepts `--yaml` or `pipeline=@` | +| `core.py` | Shared dataclasses, executor builders, run loop, version reporting | +| `slurm_config.py` | `SlurmConfig` dataclass and env-var-driven `slurm_factory` | +| `common/` | Shell scripts and `query.py` packaged to the cluster | +| `modules/Megatron-LM/` | Git submodule | +| `modules/Model-Optimizer` | Symlink to `../..` (auto-created by `launch.py` if missing) | + +## Common Commands + +```shell +# Run locally with Docker +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + +# Run on Slurm (set env vars first) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# Dry run — preview resolved config +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v + +# Dump resolved config +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml + +# Run unit tests +uv pip install pytest +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher +``` + +## YAML Config Format + +The `--yaml` format maps top-level keys to `launch()` function arguments: + +```yaml +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + global_vars: + hf_local: /hf-local/ + task_0: + script: common/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name <<global_vars.hf_local>>abisee/cnn_dailymail + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - HF_MODEL_CKPT: <<global_vars.hf_local>>Qwen/Qwen3-8B + - TP: 4 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 +``` + +Key conventions: + +- Scripts go in `common/` (not `services/`) +- `<<global_vars.X>>` interpolation for shared values across tasks +- `_factory_: "slurm_factory"` — resolved via `register_factory()` in `core.py` +- Environment is list-of-single-key-dicts: `- KEY: value` +- CLI overrides: 
`pipeline.task_0.slurm_config.nodes=2` + +## Architecture + +```text +launch.py → imports core.py + slurm_config.py + ↓ + core.run_jobs() + ↓ + build_docker_executor() or build_slurm_executor() + ↓ + nemo_run.Experiment → Docker or Slurm +``` + +- `set_slurm_config_type(SlurmConfig)` — patches `SandboxTask` annotation at import time +- `register_factory("slurm_factory", slurm_factory)` — enables YAML `_factory_` resolution +- `report_versions(base_dir)` — prints git commit/branch for launcher + submodules +- `get_default_env(title)` — returns `(slurm_env, local_env)` dicts + +## Adding a New Model Config + +1. Create `<Org>/<Model>/megatron_lm_ptq.yaml` following the format above +2. Set `MLM_MODEL_CFG` to the HuggingFace repo ID +3. Set `QUANT_CFG` (e.g., `NVFP4_DEFAULT_CFG`, `INT8_DEFAULT_CFG`) +4. Set GPU/node counts based on model size +5. Test: `uv run launch.py --yaml <path> --dryrun --yes -v` + +## Testing + +64 unit tests in `tests/unit/launcher/`. Run standalone without installing `modelopt`: + +```shell +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher +``` + +Tests cover: core dataclasses, factory registry, global_vars interpolation, YAML formats, Docker/Slurm executor construction (mocked), environment merging, metadata writing, and end-to-end Docker launch via subprocess. + +## Compatibility with nmm-sandbox + +The same YAML works with both launchers: + +```shell +# nmm-sandbox (internal) +uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# Model-Optimizer/launcher (public) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` + +Differences: `slurm.py` has internal cluster factories, `job_yaml` batch mode (via `tools/run_job_yaml.sh`), CI review integration, and `SLURM_CLUSTER` env var for factory selection. 
diff --git a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml index ce7f81224..ea83960ef 100644 --- a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml +++ b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -4,18 +4,32 @@ pipeline: allow_to_fail: false note: + # hf_local: path prefix for model weights and datasets. + # + # This should be a self-managed directory that mirrors the HuggingFace Hub + # hierarchy (e.g., /hf-local/Qwen/Qwen3-8B/, /hf-local/cais/mmlu/). Using + # a dedicated folder is preferred over the HuggingFace cache (~/.cache/huggingface) + # to avoid cache corruption issues with concurrent jobs. + # + # Override on CLI: + # pipeline.global_vars.hf_local=/mnt/my-models/ # use a different path + # pipeline.global_vars.hf_local="" # download from HuggingFace Hub + global_vars: + hf_local: /hf-local/ + task_0: script: common/megatron-lm/quantize/quantize.sh args: - - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-dataset-path-or-name <<global_vars.hf_local>>abisee/cnn_dailymail - --calib-size 32 environment: - MLM_MODEL_CFG: Qwen/Qwen3-8B - QUANT_CFG: NVFP4_DEFAULT_CFG + - HF_MODEL_CKPT: <<global_vars.hf_local>>Qwen/Qwen3-8B + - MMLU_DATASET: <<global_vars.hf_local>>cais/mmlu - TP: 4 slurm_config: - _factory_: "slurm_factory" # oci_hsg_slurm_factory + _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 4 gpus_per_node: 4 - diff --git a/launcher/README.md b/launcher/README.md index 725363341..d5365a2fc 100644 --- a/launcher/README.md +++ b/launcher/README.md @@ -31,6 +31,41 @@ uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes | `HF_TOKEN` | HuggingFace API token | No | | `NEMORUN_HOME` | NeMo Run home directory (default: cwd) | No | +## Model and Dataset Storage (`hf_local`) + +Pipeline YAMLs use a `global_vars.hf_local` path prefix for model weights and datasets. 
This should be a **self-managed directory that mirrors the HuggingFace Hub hierarchy**: + +```text +/hf-local/ +├── Qwen/Qwen3-8B/ # model weights +├── meta-llama/Llama-3.1-8B/ # model weights +├── abisee/cnn_dailymail/ # calibration dataset +└── cais/mmlu/ # evaluation dataset +``` + +Using a dedicated folder is preferred over the HuggingFace cache (`~/.cache/huggingface`) to avoid cache corruption from concurrent jobs writing to the same cache directory. + +You can populate it by copying or symlinking from an existing HuggingFace download: + +```bash +# Example: download a model and copy to hf_local +huggingface-cli download Qwen/Qwen3-8B --local-dir /hf-local/Qwen/Qwen3-8B +``` + +Override `hf_local` in any YAML via CLI: + +```bash +# Use a different local path +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.global_vars.hf_local=/mnt/my-models/ --yes + +# Download from HuggingFace Hub directly (no local cache) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.global_vars.hf_local="" --yes +``` + +For Slurm clusters, `SLURM_HF_LOCAL` sets the container mount path (e.g., `/lustre/.../hf-local:/hf-local`). 
+ ## Directory Structure ```text @@ -39,18 +74,27 @@ launcher/ ├── core.py # Shared logic (also used by nmm-sandbox's slurm.py) ├── slurm_config.py # SlurmConfig dataclass and factory ├── pyproject.toml # Dependencies (nemo-run, pyyaml) -├── services/ # Shell scripts executed on the cluster +├── common/ # Shared scripts executed on the cluster │ ├── service_utils.sh # Error handling, MPI rank utilities -│ └── megatron-lm/quantize/ -│ ├── quantize.sh # PTQ quantization + MMLU evaluation -│ └── Qwen3-8B.yaml # Task config for Qwen3-8B -├── Qwen/Qwen3-8B/ # Example pipeline config -│ └── megatron_lm_ptq.yaml -└── modules/ # Git submodules - ├── Megatron-LM/ # NVIDIA Megatron-LM training framework - └── Model-Optimizer/ # NVIDIA ModelOpt library +│ ├── query.py # OpenAI-compatible query client +│ ├── megatron-lm/quantize/ +│ │ └── quantize.sh # PTQ quantization + MMLU evaluation +│ ├── tensorrt-llm/query.sh # TRT-LLM server launch + query +│ ├── vllm/query.sh # vLLM server launch + query +│ ├── eagle3/ # EAGLE3 speculative decoding scripts +│ └── specdec_bench/ # Speculative decoding benchmark +├── Qwen/Qwen3-8B/ # Example configs +│ ├── megatron_lm_ptq.yaml # PTQ quantization pipeline +│ └── hf_offline_eagle3.yaml # EAGLE3 offline pipeline +└── modules/ # Dependencies + ├── Megatron-LM/ # Git submodule: NVIDIA Megatron-LM + └── Model-Optimizer -> ../.. # Symlink to parent (auto-created if missing) ``` +> **Note:** `modules/Model-Optimizer` is a symlink to the parent directory (`../..`), +> not a submodule. This avoids recursive nesting. `launch.py` auto-creates +> the symlink on first run if it's missing. 
+ ## YAML Config Format A config YAML defines the job name, pipeline metadata, and one or more tasks: @@ -63,14 +107,14 @@ pipeline: note: task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh args: - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail - --calib-size 32 environment: - MLM_MODEL_CFG: Qwen/Qwen3-8B - QUANT_CFG: NVFP4_DEFAULT_CFG - - TP: 1 + - TP: 4 slurm_config: _factory_: "slurm_factory" nodes: 1 @@ -80,7 +124,8 @@ pipeline: ### Multi-task Pipeline -Tasks run sequentially — `task_1` starts only after `task_0` completes: +Tasks run sequentially — `task_1` starts only after `task_0` completes. +Example (illustrative — export script may not exist yet): ```yaml job_name: Qwen3-8B_quantize_export @@ -89,7 +134,7 @@ pipeline: hf_model: /hf-local/Qwen/Qwen3-8B task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh environment: - HF_MODEL_CKPT: <<global_vars.hf_model>> slurm_config: @@ -97,7 +142,7 @@ pipeline: nodes: 1 task_1: - script: services/megatron-lm/export/export.sh + script: common/megatron-lm/export/export.sh environment: - HF_MODEL_CKPT: <<global_vars.hf_model>> slurm_config: @@ -119,7 +164,7 @@ The file contains both `job_name` and `pipeline`: job_name: Qwen3-8B_NVFP4 pipeline: task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh slurm_config: _factory_: "slurm_factory" ``` @@ -130,7 +175,7 @@ This is useful for reusing pipeline configs across different job names: ```yaml # bare_pipeline.yaml — used with: uv run launch.py pipeline=@bare_pipeline.yaml --yes task_0: - script: services/megatron-lm/quantize/quantize.sh + script: common/megatron-lm/quantize/quantize.sh slurm_config: _factory_: "slurm_factory" ``` @@ -186,7 +231,7 @@ uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml detach=true --yes ## How It Works 1. 
`launch.py` parses the YAML and creates a `SandboxPipeline` with tasks and `SlurmConfig` -2. Code is packaged via `PatternPackager` — only `modules/Megatron-LM/`, `modules/Model-Optimizer/`, and `services/` are synced +2. Code is packaged via `PatternPackager` — `modules/Megatron-LM/`, `modules/Model-Optimizer/` (via symlink), and `common/` are synced 3. For remote jobs: code is rsynced to the cluster, an sbatch script is generated and submitted via SSH 4. For local jobs: a Docker container is launched with the same container image and mounts 5. The `code/` directory on the cluster mirrors the launcher structure: @@ -196,9 +241,20 @@ code/ ├── modules/ │ ├── Megatron-LM/megatron/... │ └── Model-Optimizer/modelopt/... -└── services/... +└── common/... ``` +## Running Tests + +```bash +cd launcher +uv pip install pytest +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" \ + --confcutdir=../tests/unit/launcher +``` + +64 unit tests cover core dataclasses, factory registry, YAML parsing, Docker/Slurm executor construction, environment merging, and end-to-end Docker launch. + ## Reporting Bugs When filing a bug report, please include: @@ -236,3 +292,7 @@ uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_l # From Model-Optimizer/launcher (public) uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes ``` + +Verified: identical MMLU results (0.719 local, 0.730 OCI-HSG) from both launchers. + +For architecture details, factory system, and Claude Code workflows, see [ADVANCED.md](ADVANCED.md). 
diff --git a/launcher/common/megatron-lm/quantize/quantize.sh b/launcher/common/megatron-lm/quantize/quantize.sh index d4b3d5248..6e4d21b99 100755 --- a/launcher/common/megatron-lm/quantize/quantize.sh +++ b/launcher/common/megatron-lm/quantize/quantize.sh @@ -38,7 +38,7 @@ EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh" export MLM_EXTRA_ARGS=${@} ${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG} -export MLM_EXTRA_ARGS="--mmlu-dataset /hf-local/cais/mmlu --fraction 0.01 --lower-bound 0.38 --disable-tqdm" +export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound 0.38 --disable-tqdm" MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG} ################################################################################################### diff --git a/launcher/core.py b/launcher/core.py index 18e22dfe8..a722767cf 100644 --- a/launcher/core.py +++ b/launcher/core.py @@ -144,6 +144,7 @@ class GlobalVariables: hf_model: str = None hf_data: str = None + hf_local: str = None @dataclass diff --git a/launcher/launch.py b/launcher/launch.py index 5b90d9acf..934104264 100644 --- a/launcher/launch.py +++ b/launcher/launch.py @@ -46,6 +46,12 @@ LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__)) MODELOPT_ROOT = os.path.dirname(LAUNCHER_DIR) +# Ensure modules/Model-Optimizer symlink exists (points to parent Model-Optimizer root) +_mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") +if not os.path.exists(_mo_symlink): + os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True) + os.symlink(os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink) + EXPERIMENT_TITLE = "cicd" DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE) @@ -56,10 +62,9 @@ "modules/Megatron-LM/*.py", "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/examples/*", - "services/*", "common/*", ], - relative_path=[LAUNCHER_DIR] * 7, + 
relative_path=[LAUNCHER_DIR] * 6, ) MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") diff --git a/uv.lock b/uv.lock index 5849559ad..0f36f2dbb 100644 --- a/uv.lock +++ b/uv.lock @@ -16,9 +16,6 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", ] -[manifest] -overrides = [{ name = "torch", marker = "sys_platform == 'never'" }] - [[package]] name = "accelerate" version = "1.13.0" @@ -31,7 +28,7 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } wheels = [ @@ -407,6 +404,19 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/54/27/01d9078a77b9e31b79b9716e66ca4db74f4744c5232bcb3e8769395c4280/cppimport-22.8.2.tar.gz", hash = "sha256:bbb4957102db41bc99ad72c233bce92f9d1fd91be352fc07878c4361033a401f", size = 26635, upload-time = "2022-08-02T16:50:36.872Z" } +[[package]] +name = "cuda-bindings" +version = "12.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cuda-pathfinder", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/d8/b546104b8da3f562c1ff8ab36d130c8fe1dd6a045ced80b4f6ad74f7d4e1/cuda_bindings-12.9.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d3c842c2a4303b2a580fe955018e31aea30278be19795ae05226235268032e5", size = 12148218, upload-time = "2025-10-21T14:51:28.855Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/e7/b47792cc2d01c7e1d37c32402182524774dadd2d26339bd224e0e913832e/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c912a3d9e6b6651853eed8eed96d6800d69c08e94052c292fec3f282c5a817c9", size = 12210593, upload-time = "2025-10-21T14:51:36.574Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, +] + [[package]] name = "cuda-pathfinder" version = "1.4.1" @@ -478,7 +488,7 @@ dependencies = [ { name = "psutil", marker = "sys_platform != 'win32'" }, { name = "py-cpuinfo", marker = "sys_platform != 'win32'" }, { name = "pydantic", marker = "sys_platform != 'win32'" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch", marker = "sys_platform != 'win32'" }, { name = "tqdm", marker = "sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/00/ad310cf94e0e397c416087e6c4dc782429292206b2b1a3ffbd388ac95a67/deepspeed-0.18.7.tar.gz", hash = "sha256:3763530196f8e7df8fc56d028a8c64409200695213920dc6cf0045d50c884079", size = 1646894, upload-time = "2026-03-05T20:44:56.579Z" } @@ -1106,7 +1116,9 @@ name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'win32'", "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version < '3.11' and sys_platform == 'darwin')", + "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'win32'", "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform != 'darwin' and 
sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } @@ -1119,10 +1131,14 @@ name = "networkx" version = "3.6.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'win32'", "(python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version >= '3.12' and sys_platform == 'darwin')", "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version == '3.11.*' and sys_platform == 'darwin')", "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", "python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", + "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } wheels = [ @@ -1274,6 +1290,108 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/57/a7/b35835e278c18b85206834b3aa3abe68e77a98769c59233d1f6300284781/numpy-2.4.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:4b42639cdde6d24e732ff823a3fa5b701d8acad89c4142bc1d0bd6dc85200ba5", 
size = 12504685, upload-time = "2026-03-09T07:58:50.525Z" }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, +] + +[[package]] +name = 
"nvidia-cudnn-cu12" +version = "9.10.2.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, +] + [[package]] name = "nvidia-ml-py" version = "13.590.48" @@ -1300,7 +1418,7 @@ dependencies = [ { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "setuptools" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "tqdm" }, ] @@ -1505,6 +1623,38 @@ requires-dist = [ ] provides-extras = ["onnx", "hf", "dev-lint", "dev-docs", "dev-test", "all", "dev"] +[[package]] +name = "nvidia-nccl-cu12" +version = "2.27.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, +] + [[package]] name = "onnx" version = "1.19.1" @@ -1829,7 +1979,7 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -2882,7 +3032,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "torchvision" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d7/2c/593109822fe735e637382aca6640c1102c19797f7791f1fd1dab2d6c3cb1/timm-1.0.25.tar.gz", hash = "sha256:47f59fc2754725735cc81bb83bcbfce5bec4ebd5d4bb9e69da57daa92fcfa768", size = 2414743, upload-time = "2026-02-23T16:49:00.137Z" } @@ -2961,15 +3111,52 @@ name = "torch" version = "2.10.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "cuda-bindings", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx", version = "3.4.2", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/30/bfebdd8ec77db9a79775121789992d6b3b75ee5494971294d7b4b7c999bc/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2b980edd8d7c0a68c4e951ee1856334a43193f98730d97408fbd148c1a933313", size = 79411457, upload-time = "2026-02-10T21:44:59.189Z" }, + { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" }, + { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/16/ee/efbd56687be60ef9af0c9c0ebe106964c07400eade5b0af8902a1d8cd58c/torch-2.10.0-3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a1ff626b884f8c4e897c4c33782bdacdff842a165fee79817b1dd549fdda1321", size = 915510070, upload-time = "2026-03-11T14:16:39.386Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/7b562f1808d3f65414cd80a4f7d4bb00979d9355616c034c171249e1a303/torch-2.10.0-3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ac5bdcbb074384c66fa160c15b1ead77839e3fe7ed117d667249afce0acabfac", size = 915518691, upload-time = "2026-03-11T14:15:43.147Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" }, + { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" }, + { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" }, + { url = "https://files.pythonhosted.org/packages/76/bb/d820f90e69cda6c8169b32a0c6a3ab7b17bf7990b8f2c680077c24a3c14c/torch-2.10.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:35e407430795c8d3edb07a1d711c41cc1f9eaddc8b2f1cc0a165a6767a8fb73d", size = 79411450, upload-time = "2026-01-21T16:25:30.692Z" }, + { url = "https://files.pythonhosted.org/packages/78/89/f5554b13ebd71e05c0b002f95148033e730d3f7067f67423026cc9c69410/torch-2.10.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3282d9febd1e4e476630a099692b44fdc214ee9bf8ee5377732d9d9dfe5712e4", size = 145992610, upload-time = "2026-01-21T16:25:26.327Z" }, + { url = "https://files.pythonhosted.org/packages/ae/30/a3a2120621bf9c17779b169fc17e3dc29b230c29d0f8222f499f5e159aa8/torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a2f9edd8dbc99f62bc4dfb78af7bf89499bca3d753423ac1b4e06592e467b763", size = 915607863, upload-time = "2026-01-21T16:25:06.696Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:29b7009dba4b7a1c960260fc8ac85022c784250af43af9fb0ebafc9883782ebd", size = 113723116, upload-time = "2026-01-21T16:25:21.916Z" }, + { url = "https://files.pythonhosted.org/packages/61/d8/15b9d9d3a6b0c01b883787bd056acbe5cc321090d4b216d3ea89a8fcfdf3/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b7bd80f3477b830dd166c707c5b0b82a898e7b16f59a7d9d42778dd058272e8b", size = 79423461, upload-time = "2026-01-21T16:24:50.266Z" }, + { url = "https://files.pythonhosted.org/packages/cc/af/758e242e9102e9988969b5e621d41f36b8f258bb4a099109b7a4b4b50ea4/torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5fd4117d89ffd47e3dcc71e71a22efac24828ad781c7e46aaaf56bf7f2796acf", size = 145996088, upload-time = "2026-01-21T16:24:44.171Z" }, + { url = "https://files.pythonhosted.org/packages/23/8e/3c74db5e53bff7ed9e34c8123e6a8bfef718b2450c35eefab85bb4a7e270/torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:787124e7db3b379d4f1ed54dd12ae7c741c16a4d29b49c0226a89bea50923ffb", size = 915711952, upload-time = "2026-01-21T16:23:53.503Z" }, + { url = "https://files.pythonhosted.org/packages/6e/01/624c4324ca01f66ae4c7cd1b74eb16fb52596dce66dbe51eff95ef9e7a4c/torch-2.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c66c61f44c5f903046cc696d088e21062644cbe541c7f1c4eaae88b2ad23547", size = 113757972, upload-time = "2026-01-21T16:24:39.516Z" }, + { url = "https://files.pythonhosted.org/packages/c9/5c/dee910b87c4d5c0fcb41b50839ae04df87c1cfc663cf1b5fca7ea565eeaa/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6d3707a61863d1c4d6ebba7be4ca320f42b869ee657e9b2c21c736bf17000294", size = 79498198, upload-time = "2026-01-21T16:24:34.704Z" }, +] [[package]] name = "torch-geometric" @@ -2999,7 +3186,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version 
= "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "torchvision" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/36/574c0c46e818533b78b3c09505211162918188325ab4165ef11a3f295755/torchprofile-0.0.4.tar.gz", hash = "sha256:96b6da17d752a06b02977e078aea95614893b31d4117dd5dcd081f30ce65611b", size = 4557, upload-time = "2021-06-22T04:58:03.592Z" } @@ -3015,7 +3202,7 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pillow" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/50/ae/cbf727421eb73f1cf907fbe5788326a08f111b3f6b6ddca15426b53fec9a/torchvision-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a95c47abb817d4e90ea1a8e57bd0d728e3e6b533b3495ae77d84d883c4d11f56", size = 1874919, upload-time = "2026-01-21T16:27:47.617Z" }, @@ -3100,6 +3287,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, ] +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" }, + { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" From 4a05a1d1ad4ddb05fa43072c61990fc2466faf1f Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sat, 14 Mar 2026 21:32:48 -0700 Subject: [PATCH 09/12] fix: skip launcher tests when nemo_run not installed, add docstrings Skip all launcher tests with pytest.skip when nemo_run is not available (CI tox env doesn't have it). Add docstrings to __post_init__ and _resolve for 100% docstring coverage. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- launcher/core.py | 2 ++ tests/unit/launcher/conftest.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/launcher/core.py b/launcher/core.py index a722767cf..de2f5b061 100644 --- a/launcher/core.py +++ b/launcher/core.py @@ -171,6 +171,7 @@ class SandboxPipeline: _factory_lookup: dict = None def __post_init__(self): + """Collect tasks from slots/configs and resolve <<global_vars.X>> references.""" if self.tasks is None: self.tasks = [] for i in range(5): @@ -191,6 +192,7 @@ def __post_init__(self): } def _resolve(s): + """Replace <<global_vars.X>> with the corresponding value.""" if not isinstance(s, str): return s return re.sub( diff --git a/tests/unit/launcher/conftest.py b/tests/unit/launcher/conftest.py index d19ced583..44fd9d936 100644 --- a/tests/unit/launcher/conftest.py +++ b/tests/unit/launcher/conftest.py @@ -15,10 +15,12 @@ """Fixtures for launcher unit tests. -These tests can be run standalone without installing modelopt: +These tests require nemo_run and are skipped when it's not installed. + +Standalone run (from launcher/ directory): cd Model-Optimizer/launcher uv pip install pytest - uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --rootdir=. 
+ uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher """ import os @@ -26,8 +28,11 @@ import pytest -# Prevent pytest from loading the root conftest.py (which imports torch/modelopt) -collect_ignore_glob = ["../../conftest.py"] +# Skip all tests in this directory if nemo_run is not installed +try: + import nemo_run # noqa: F401 +except ImportError: + pytest.skip("nemo_run not installed, skipping launcher tests", allow_module_level=True) @pytest.fixture(autouse=True) From edaaab06e84cd30d08657966e60e4e8788bbcc24 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sat, 14 Mar 2026 21:43:13 -0700 Subject: [PATCH 10/12] chg: move launcher tests to launcher/tests/, add CI workflow Move tests from tests/unit/launcher/ to launcher/tests/ for self-containment. Add launcher job to unit_tests.yml using uv. Add pytest.ini to override root pyproject.toml addopts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- .github/workflows/unit_tests.yml | 21 +++++++++++++++++-- launcher/CLAUDE.md | 2 ++ launcher/pytest.ini | 2 ++ .../launcher => launcher/tests}/__init__.py | 0 .../launcher => launcher/tests}/conftest.py | 18 ++++++---------- .../launcher => launcher/tests}/test_core.py | 1 + .../tests}/test_core_extended.py | 1 + .../tests}/test_docker_execution.py | 1 + .../tests}/test_docker_launch.py | 4 ++-- .../tests}/test_slurm_config.py | 1 + .../tests}/test_slurm_executor.py | 1 + .../tests}/test_yaml_formats.py | 1 - 12 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 launcher/pytest.ini rename {tests/unit/launcher => launcher/tests}/__init__.py (100%) rename {tests/unit/launcher => launcher/tests}/conftest.py (70%) rename {tests/unit/launcher => launcher/tests}/test_core.py (99%) rename {tests/unit/launcher => launcher/tests}/test_core_extended.py (99%) rename {tests/unit/launcher => 
launcher/tests}/test_docker_execution.py (99%) rename {tests/unit/launcher => launcher/tests}/test_docker_launch.py (98%) rename {tests/unit/launcher => launcher/tests}/test_slurm_config.py (99%) rename {tests/unit/launcher => launcher/tests}/test_slurm_executor.py (99%) rename {tests/unit/launcher => launcher/tests}/test_yaml_formats.py (99%) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f9745ce3c..3156efcab 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -12,6 +12,7 @@ on: - "tests/unit/**" - "pyproject.toml" - "tox.ini" + - "launcher/**" schedule: - cron: "0 0 * * *" # Nightly workflow_dispatch: # On-demand @@ -98,6 +99,21 @@ jobs: - uses: ./.github/actions/ubuntu-setup - name: Run unit tests run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit + launcher: + if: github.event_name == 'pull_request' + needs: [linux] + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v6 + with: + submodules: recursive + - name: Run launcher tests + working-directory: launcher + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + uv pip install pytest + uv run python3 -m pytest -v partial-install: if: github.event_name == 'pull_request' needs: [linux] @@ -114,7 +130,7 @@ jobs: unit-pr-required-check: # Run even if some jobs are skipped if: ${{ github.event_name == 'pull_request' && always() }} - needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install] + needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install, launcher] runs-on: ubuntu-latest steps: - name: Required unit tests did not succeed @@ -124,5 +140,6 @@ jobs: needs.multi-py.result != 'success' || needs.multi-torch.result != 'success' || needs.multi-transformers.result != 'success' || - needs.partial-install.result != 'success' }} + needs.partial-install.result != 'success' || + needs.launcher.result != 'success' }} run: exit 1 diff 
--git a/launcher/CLAUDE.md b/launcher/CLAUDE.md index 288923272..3cc03a67e 100644 --- a/launcher/CLAUDE.md +++ b/launcher/CLAUDE.md @@ -96,6 +96,8 @@ launch.py → imports core.py + slurm_config.py 64 unit tests in `tests/unit/launcher/`. Run standalone without installing `modelopt`: +From the launcher directory: + ```shell uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher ``` diff --git a/launcher/pytest.ini b/launcher/pytest.ini new file mode 100644 index 000000000..5ee647716 --- /dev/null +++ b/launcher/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests diff --git a/tests/unit/launcher/__init__.py b/launcher/tests/__init__.py similarity index 100% rename from tests/unit/launcher/__init__.py rename to launcher/tests/__init__.py diff --git a/tests/unit/launcher/conftest.py b/launcher/tests/conftest.py similarity index 70% rename from tests/unit/launcher/conftest.py rename to launcher/tests/conftest.py index 44fd9d936..bb6ccb045 100644 --- a/tests/unit/launcher/conftest.py +++ b/launcher/tests/conftest.py @@ -15,12 +15,13 @@ """Fixtures for launcher unit tests. -These tests require nemo_run and are skipped when it's not installed. 
- -Standalone run (from launcher/ directory): +Run from the launcher directory: cd Model-Optimizer/launcher uv pip install pytest - uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher + uv run python3 -m pytest tests/ -v + +Or via tox from Model-Optimizer root: + tox -e py312-launcher """ import os @@ -28,18 +29,11 @@ import pytest -# Skip all tests in this directory if nemo_run is not installed -try: - import nemo_run # noqa: F401 -except ImportError: - pytest.skip("nemo_run not installed, skipping launcher tests", allow_module_level=True) - @pytest.fixture(autouse=True) def add_launcher_to_path(): """Add the launcher directory to sys.path so core.py and slurm_config.py can be imported.""" - launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") - launcher_dir = os.path.abspath(launcher_dir) + launcher_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if launcher_dir not in sys.path: sys.path.insert(0, launcher_dir) yield diff --git a/tests/unit/launcher/test_core.py b/launcher/tests/test_core.py similarity index 99% rename from tests/unit/launcher/test_core.py rename to launcher/tests/test_core.py index 69c0fc40d..6c7e8f043 100644 --- a/tests/unit/launcher/test_core.py +++ b/launcher/tests/test_core.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# ruff: noqa: D102 """Tests for launcher/core.py — shared dataclasses, factory registry, and utilities. 
Coverage: diff --git a/tests/unit/launcher/test_core_extended.py b/launcher/tests/test_core_extended.py similarity index 99% rename from tests/unit/launcher/test_core_extended.py rename to launcher/tests/test_core_extended.py index 698c5b438..9d4ba5604 100644 --- a/tests/unit/launcher/test_core_extended.py +++ b/launcher/tests/test_core_extended.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# ruff: noqa: D102 """Extended tests for launcher/core.py — edge cases and remaining coverage gaps. Coverage: diff --git a/tests/unit/launcher/test_docker_execution.py b/launcher/tests/test_docker_execution.py similarity index 99% rename from tests/unit/launcher/test_docker_execution.py rename to launcher/tests/test_docker_execution.py index 693071bb3..6d3fa0fa7 100644 --- a/tests/unit/launcher/test_docker_execution.py +++ b/launcher/tests/test_docker_execution.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# ruff: noqa: D102 """Tests for Docker execution path — verifies build_docker_executor and run_jobs with mocked Docker. 
Coverage: diff --git a/tests/unit/launcher/test_docker_launch.py b/launcher/tests/test_docker_launch.py similarity index 98% rename from tests/unit/launcher/test_docker_launch.py rename to launcher/tests/test_docker_launch.py index 8baad32c8..625d28b08 100644 --- a/tests/unit/launcher/test_docker_launch.py +++ b/launcher/tests/test_docker_launch.py @@ -57,7 +57,7 @@ def test_echo_script_via_launch(self, tmp_path): yaml_path.write_text(yaml_content) # Run launch.py as a subprocess (avoids pytest stdin capture issues) - launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") + launcher_dir = os.path.join(os.path.dirname(__file__), "..") launcher_dir = os.path.abspath(launcher_dir) result = subprocess.run( @@ -100,7 +100,7 @@ def test_failing_script_via_launch(self, tmp_path): yaml_path = tmp_path / "fail_test.yaml" yaml_path.write_text(yaml_content) - launcher_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "launcher") + launcher_dir = os.path.join(os.path.dirname(__file__), "..") launcher_dir = os.path.abspath(launcher_dir) result = subprocess.run( diff --git a/tests/unit/launcher/test_slurm_config.py b/launcher/tests/test_slurm_config.py similarity index 99% rename from tests/unit/launcher/test_slurm_config.py rename to launcher/tests/test_slurm_config.py index aeb09200e..b23c46c24 100644 --- a/tests/unit/launcher/test_slurm_config.py +++ b/launcher/tests/test_slurm_config.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# ruff: noqa: D102 """Tests for launcher/slurm_config.py — SlurmConfig dataclass and factory. 
Coverage: diff --git a/tests/unit/launcher/test_slurm_executor.py b/launcher/tests/test_slurm_executor.py similarity index 99% rename from tests/unit/launcher/test_slurm_executor.py rename to launcher/tests/test_slurm_executor.py index 48004c786..d7ac7827f 100644 --- a/tests/unit/launcher/test_slurm_executor.py +++ b/launcher/tests/test_slurm_executor.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# ruff: noqa: D102 """Tests for build_slurm_executor — container mounts, scratch paths, executor params. Note: actual SSH tunnel and sbatch submission are not tested (require live infra). diff --git a/tests/unit/launcher/test_yaml_formats.py b/launcher/tests/test_yaml_formats.py similarity index 99% rename from tests/unit/launcher/test_yaml_formats.py rename to launcher/tests/test_yaml_formats.py index 571535343..981c32216 100644 --- a/tests/unit/launcher/test_yaml_formats.py +++ b/launcher/tests/test_yaml_formats.py @@ -58,7 +58,6 @@ def test_yaml_format_with_job_name(self, tmp_yaml): def test_bare_pipeline_format(self, tmp_yaml): """The pipeline=@ format is a bare SandboxPipeline without wrapper.""" - content = """ task_0: script: a.sh From 472c091be6a44c732799294d0ada943b2e383f78 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sun, 15 Mar 2026 09:18:33 -0700 Subject: [PATCH 11/12] fix: create venv before uv pip install in launcher CI job Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- .github/workflows/unit_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 3156efcab..fc23c8c23 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -112,6 +112,8 @@ jobs: working-directory: launcher run: | curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$PATH" + uv venv 
.venv uv pip install pytest uv run python3 -m pytest -v partial-install: From 410de11a8b4f700a24b2254f65f45937b62e8446 Mon Sep 17 00:00:00 2001 From: Chenhan Yu <chenhany@nvidia.com> Date: Sun, 15 Mar 2026 10:57:30 -0700 Subject: [PATCH 12/12] fix: use nemo-run from PyPI, install project deps in CI Switch from git-pinned nemo-run to nemo-run>=0.8.0 from PyPI (avoids uv TOML parse error). Add py-modules=[] to prevent setuptools auto- discovery. CI installs project with `uv pip install -e . pytest`. Add ModelOpt mount mechanism docs to ADVANCED.md. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Chenhan Yu <chenhany@nvidia.com> --- .github/workflows/unit_tests.yml | 2 +- launcher/ADVANCED.md | 29 ++++++++++++++++++++++++++++- launcher/pyproject.toml | 5 ++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index fc23c8c23..006056ac0 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -114,7 +114,7 @@ jobs: curl -LsSf https://astral.sh/uv/install.sh | sh export PATH="$HOME/.local/bin:$PATH" uv venv .venv - uv pip install pytest + uv pip install -e . pytest uv run python3 -m pytest -v partial-install: if: github.event_name == 'pull_request' diff --git a/launcher/ADVANCED.md b/launcher/ADVANCED.md index 8698f4ce8..cc8678c95 100644 --- a/launcher/ADVANCED.md +++ b/launcher/ADVANCED.md @@ -44,7 +44,34 @@ code/ └── query.py # OpenAI-compatible query client ``` -The `modelopt/` directory is bind-mounted over the container's installed ModelOpt, so your local changes take effect without rebuilding the container. +### ModelOpt Mount Mechanism + +The container image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5`) ships with a pre-installed version of ModelOpt at a fixed path like `/usr/local/lib/python3.12/dist-packages/modelopt`. 
The launcher **bind-mounts your local `modelopt/` over this path**, so your local changes take effect without rebuilding the container. + +The mount is configured via `modelopt_install_path` in `SlurmConfig`: + +```yaml +slurm_config: + modelopt_install_path: /usr/local/lib/python3.12/dist-packages/modelopt +``` + +At runtime, the executor constructs the mount: + +- **Slurm**: `{job_dir}/{experiment_title}/{exp_id}/{task}/code/modules/Model-Optimizer/modelopt` → `{modelopt_install_path}` +- **Docker**: `{LAUNCHER_DIR}/modules/Model-Optimizer/modelopt` → `{modelopt_install_path}` (follows the symlink to the parent's `modelopt/`) + +This means: + +1. You can edit `modelopt/` source code locally +2. Submit a job — the packager tars your changes and ships them to the cluster +3. On the cluster, the container sees your modified `modelopt/` instead of the pre-installed one +4. No container rebuild needed for iterating on ModelOpt changes + +The `modelopt_install_path` varies by container image. Check with: + +```bash +docker run --rm <image> python3 -c "import modelopt; print(modelopt.__file__)" +``` ### Model-Optimizer Symlink diff --git a/launcher/pyproject.toml b/launcher/pyproject.toml index 0e576e5af..6ecc201e8 100644 --- a/launcher/pyproject.toml +++ b/launcher/pyproject.toml @@ -4,9 +4,12 @@ version = "0.1.0" description = "ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters" requires-python = ">=3.10" dependencies = [ - "nemo-run@git+https://github.com/NVIDIA-NeMo/Run@2ccf1c9e68acd157da451721b24635bcc83be87e", + "nemo-run>=0.8.0", "pyyaml", ] +[tool.setuptools] +py-modules = [] + [dependency-groups] dev = []