diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f9745ce3c..006056ac0 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -12,6 +12,7 @@ on: - "tests/unit/**" - "pyproject.toml" - "tox.ini" + - "launcher/**" schedule: - cron: "0 0 * * *" # Nightly workflow_dispatch: # On-demand @@ -98,6 +99,23 @@ jobs: - uses: ./.github/actions/ubuntu-setup - name: Run unit tests run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit + launcher: + if: github.event_name == 'pull_request' + needs: [linux] + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v6 + with: + submodules: recursive + - name: Run launcher tests + working-directory: launcher + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$PATH" + uv venv .venv + uv pip install -e . pytest + uv run python3 -m pytest -v partial-install: if: github.event_name == 'pull_request' needs: [linux] @@ -114,7 +132,7 @@ jobs: unit-pr-required-check: # Run even if some jobs are skipped if: ${{ github.event_name == 'pull_request' && always() }} - needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install] + needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install, launcher] runs-on: ubuntu-latest steps: - name: Required unit tests did not succeed @@ -124,5 +142,6 @@ jobs: needs.multi-py.result != 'success' || needs.multi-torch.result != 'success' || needs.multi-transformers.result != 'success' || - needs.partial-install.result != 'success' }} + needs.partial-install.result != 'success' || + needs.launcher.result != 'success' }} run: exit 1 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..87630967d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "launcher/modules/Megatron-LM"] + path = launcher/modules/Megatron-LM + url = https://github.com/AAnoosheh/Megatron-LM.git diff --git 
a/launcher/.gitignore b/launcher/.gitignore new file mode 100644 index 000000000..3eb4a4907 --- /dev/null +++ b/launcher/.gitignore @@ -0,0 +1,22 @@ +# Virtual environment +.venv/ + +# nemo-run state +.slurm_jobs +.docker_jobs.json +.local_jobs.json + +# Experiment artifacts (generated at runtime) +experiments/ +local_experiments/ + +# uv lock (generated, not portable) +uv.lock + +# Python cache +__pycache__/ + +# Editor swap files +*.swp +*.swo +*~ diff --git a/launcher/ADVANCED.md b/launcher/ADVANCED.md new file mode 100644 index 000000000..cc8678c95 --- /dev/null +++ b/launcher/ADVANCED.md @@ -0,0 +1,281 @@ +# Advanced Guide + +## Architecture + +### Shared Core + +The launcher is built on a shared `core.py` module used by both: + +- **`launch.py`** — public-facing launcher (this repo) +- **`slurm.py`** — internal CI orchestrator ([nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)) + +```text +core.py (shared) +├── Dataclasses: SandboxTask, SandboxPipeline, GlobalVariables +├── Executor builders: build_slurm_executor(), build_docker_executor() +├── Job runner: run_jobs() +├── Version reporter: report_versions() +├── Factory registry: register_factory(), set_slurm_config_type() +└── Default env: get_default_env() + +launch.py slurm.py (nmm-sandbox) +├── imports core.py ├── imports core.py (via sys.path) +├── slurm_config.py (env-var driven) ├── tools/slurm_config.py (cluster-specific) +├── registers: slurm_factory ├── registers: oci_hsg, cw_dfw, computelab, ... +├── packager (LAUNCHER_DIR relative) ├── packager (repo root relative) +└── launch() entrypoint └── cicd() entrypoint +``` + +### Code Packaging + +When a job is submitted, `PatternPackager` creates a tar.gz of the source code and rsyncs it to the cluster. The `code/` directory on the remote mirrors the launcher structure: + +```text +code/ +├── modules/ +│ ├── Megatron-LM/megatron/... # Training framework +│ └── Model-Optimizer/modelopt/... 
# ModelOpt library (mounted over container install) +└── common/ + ├── megatron-lm/quantize/ + │ └── quantize.sh # PTQ quantization + MMLU + ├── tensorrt-llm/query.sh # TRT-LLM server + query + ├── vllm/query.sh # vLLM server + query + ├── eagle3/ # EAGLE3 pipeline scripts + └── query.py # OpenAI-compatible query client +``` + +### ModelOpt Mount Mechanism + +The container image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5`) ships with a pre-installed version of ModelOpt at a fixed path like `/usr/local/lib/python3.12/dist-packages/modelopt`. The launcher **bind-mounts your local `modelopt/` over this path**, so your local changes take effect without rebuilding the container. + +The mount is configured via `modelopt_install_path` in `SlurmConfig`: + +```yaml +slurm_config: + modelopt_install_path: /usr/local/lib/python3.12/dist-packages/modelopt +``` + +At runtime, the executor constructs the mount: + +- **Slurm**: `{job_dir}/{experiment_title}/{exp_id}/{task}/code/modules/Model-Optimizer/modelopt` → `{modelopt_install_path}` +- **Docker**: `{LAUNCHER_DIR}/modules/Model-Optimizer/modelopt` → `{modelopt_install_path}` (follows the symlink to the parent's `modelopt/`) + +This means: + +1. You can edit `modelopt/` source code locally +2. Submit a job — the packager tars your changes and ships them to the cluster +3. On the cluster, the container sees your modified `modelopt/` instead of the pre-installed one +4. No container rebuild needed for iterating on ModelOpt changes + +The `modelopt_install_path` varies by container image. Check with: + +```bash +docker run --rm <container-image> python3 -c "import modelopt; print(modelopt.__file__)" +``` + +### Model-Optimizer Symlink + +`launcher/modules/Model-Optimizer` is a **symlink** to `../..` (the Model-Optimizer root), not a git submodule. This avoids recursive nesting — the launcher lives inside Model-Optimizer and references its own parent. 
+ +- Git tracks the symlink natively (`git clone` preserves it) +- `launch.py` auto-creates the symlink on first run if it's missing +- The packager's `find` follows symlinks, so `modules/Model-Optimizer/modelopt/*` resolves correctly + +### Factory System + +Slurm cluster configs use a factory pattern. YAMLs reference a factory by name: + +```yaml +slurm_config: + _factory_: "slurm_factory" + nodes: 1 +``` + +Factories are registered at import time via `register_factory()`. In `launch.py`, `slurm_factory` reads from environment variables (`SLURM_HOST`, `SLURM_ACCOUNT`, etc.). In `slurm.py`, `slurm_factory` resolves to a cluster-specific factory based on `SLURM_CLUSTER`: + +```bash +# Default (OCI-HSG) +uv run slurm.py --yaml config.yaml --yes + +# Switch cluster +SLURM_CLUSTER=cw_dfw uv run slurm.py --yaml config.yaml --yes +``` + +### YAML Formats + +**`--yaml` format** (recommended) — maps top-level keys to function args: + +```yaml +job_name: Qwen3-8B_NVFP4 +pipeline: + task_0: + script: common/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**`pipeline=@` format** — bare pipeline without wrapper: + +```yaml +task_0: + script: common/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**Test YAML format** — list of jobs with `_target_` and overrides, used by nmm-sandbox's `tools/run_test_yaml.sh` for CI: + +```yaml +- _target_: Qwen/Qwen3-8B/megatron_lm_ptq.yaml + pipeline: + allow_to_fail: true + skip: false + note: "known flaky" +``` + +Overrides are flattened to dot-notation and passed as nemo-run CLI args (e.g., `pipeline.allow_to_fail=True`). 
+ +### Global Variables + +Pipeline YAMLs support `<>` interpolation for sharing values across tasks: + +```yaml +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + task_0: + environment: + - HF_MODEL_CKPT: <> + + task_1: + environment: + - HF_MODEL_CKPT: <> +``` + +This is resolved in `SandboxPipeline.__post_init__` using regex substitution, not OmegaConf (which fails on isolated sub-configs in nemo-run). + +### Metadata + +Each experiment writes `metadata.json` to `experiments//<id>/`: + +```json +{ + "experiment_id": "cicd_1773420387", + "job_name": "Qwen3-8B_NVFP4_DEFAULT_CFG", + "allow_to_fail": false, + "note": "" +} +``` + +This is used by: + +- `tools/wait_for_experiments.sh` — skip blocking on `allow_to_fail` failures +- `tools/post_review_to_gitlab.sh` — create/update GitLab issues for allowed failures +- Claude Code's `review-logs` skill — emit `<system-out>` instead of `<failure>` in JUnit XML + +## Using Claude Code with the Launcher + +Claude Code can create a tight feedback loop for model quantization experiments: configure → submit → monitor → diagnose → fix → resubmit — all from the CLI. + +### Setup + +Install Claude Code and ensure the launcher is ready: + +```bash +npm install -g @anthropic-ai/claude-code +cd Model-Optimizer/launcher +git submodule update --init --recursive +``` + +### Workflow: Submit and Monitor + +Ask Claude Code to launch a job and wait for results: + +```text +> Run Qwen3-8B quantization on OCI-HSG and wait for it to finish + +Claude will: +1. Run: uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +2. Monitor with: NEMORUN_HOME=$(pwd) uv run nemo experiment status <id> +3. Fetch logs when done: NEMORUN_HOME=$(pwd) uv run nemo experiment logs <id> 0 +4. Report the MMLU score and pass/fail status +``` + +### Workflow: Diagnose Failures + +When a job fails, ask Claude Code to analyze the logs: + +```text +> /review-logs + +Claude will: +1. Find all experiments in experiments/ +2. 
Fetch logs via nemo experiment logs +3. Read and analyze error tracebacks +4. Produce a structured report with root cause and suggested fix +5. Write a JUnit XML for CI integration +``` + +### Workflow: Add a New Model + +Ask Claude Code to set up a new model config: + +```text +> Add Llama-3.1-70B quantization config. It needs 2 nodes with 4 GPUs each. + +Claude will: +1. Create Meta/Llama-3.1-70B/megatron_lm_ptq.yaml +2. Set appropriate TP/EP based on model size +3. Reference the correct service script +4. Test with --dryrun to verify the config +``` + +### Workflow: Iterate on Failures + +Claude Code can fix issues and resubmit in a loop: + +```text +> The job failed with CUDA OOM. Try reducing the sequence length to 4096 and resubmit. + +Claude will: +1. Edit the YAML config +2. Resubmit with uv run launch.py --yaml <config> --yes +3. Monitor and report results +``` + +### Workflow: Reproduce and Compare + +Use `--to-yaml` to capture configs and compare runs: + +```text +> Dump the resolved config for Qwen3-8B, then run it on both OCI-HSG and CW-DFW + +Claude will: +1. Dump: uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml +2. Run on OCI-HSG: SLURM_CLUSTER=oci_hsg uv run slurm.py --yaml resolved.yaml --yes +3. Run on CW-DFW: SLURM_CLUSTER=cw_dfw uv run slurm.py --yaml resolved.yaml --yes +4. Compare MMLU results +``` + +### Skills + +The following Claude Code skills are available in the nmm-sandbox project: + +| Skill | Trigger | Description | +|---|---|---| +| `/review-logs` | After job completion or failure | Analyze experiment logs, diagnose failures, produce JUnit XML | +| `/wait-for-jobs` | After detached submission | Poll experiment status until all jobs finish | +| `/eagle3-new-model` | Adding a new EAGLE3 model | Generate pipeline YAML for a new model | + +### CI Integration + +In CI, Claude Code runs automatically after each test job to: + +1. Fetch and analyze all experiment logs +2. 
Generate `claude_analysis.md` with structured findings +3. Write `claude_review_rspec.xml` for GitLab test reporting +4. Post failure summaries as MR comments (via `tools/post_review_to_gitlab.sh`) +5. Create/update GitLab issues for `allow_to_fail` jobs that are consistently failing + +If the main script crashes before the review runs, an `after_script` fallback posts the captured job output to the MR so failures are always visible. diff --git a/launcher/CLAUDE.md b/launcher/CLAUDE.md new file mode 100644 index 000000000..3cc03a67e --- /dev/null +++ b/launcher/CLAUDE.md @@ -0,0 +1,119 @@ +# CLAUDE.md — ModelOpt Launcher + +## Overview + +The launcher submits ModelOpt quantization, training, and evaluation jobs to Slurm clusters or runs them locally with Docker. It shares core logic (`core.py`) with [nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)'s `slurm.py`. + +## Key Files + +| File | Role | +|------|------| +| `launch.py` | Public entrypoint — accepts `--yaml` or `pipeline=@` | +| `core.py` | Shared dataclasses, executor builders, run loop, version reporting | +| `slurm_config.py` | `SlurmConfig` dataclass and env-var-driven `slurm_factory` | +| `common/` | Shell scripts and `query.py` packaged to the cluster | +| `modules/Megatron-LM/` | Git submodule | +| `modules/Model-Optimizer` | Symlink to `../..` (auto-created by `launch.py` if missing) | + +## Common Commands + +```shell +# Run locally with Docker +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + +# Run on Slurm (set env vars first) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# Dry run — preview resolved config +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v + +# Dump resolved config +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml + +# Run unit tests +uv pip install pytest +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" 
--confcutdir=../tests/unit/launcher +``` + +## YAML Config Format + +The `--yaml` format maps top-level keys to `launch()` function arguments: + +```yaml +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + global_vars: + hf_local: /hf-local/ + task_0: + script: common/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name <<global_vars.hf_local>>abisee/cnn_dailymail + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - HF_MODEL_CKPT: <<global_vars.hf_local>>Qwen/Qwen3-8B + - TP: 4 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 +``` + +Key conventions: + +- Scripts go in `common/` (not `services/`) +- `<<global_vars.X>>` interpolation for shared values across tasks +- `_factory_: "slurm_factory"` — resolved via `register_factory()` in `core.py` +- Environment is list-of-single-key-dicts: `- KEY: value` +- CLI overrides: `pipeline.task_0.slurm_config.nodes=2` + +## Architecture + +```text +launch.py → imports core.py + slurm_config.py + ↓ + core.run_jobs() + ↓ + build_docker_executor() or build_slurm_executor() + ↓ + nemo_run.Experiment → Docker or Slurm +``` + +- `set_slurm_config_type(SlurmConfig)` — patches `SandboxTask` annotation at import time +- `register_factory("slurm_factory", slurm_factory)` — enables YAML `_factory_` resolution +- `report_versions(base_dir)` — prints git commit/branch for launcher + submodules +- `get_default_env(title)` — returns `(slurm_env, local_env)` dicts + +## Adding a New Model Config + +1. Create `<Org>/<Model>/megatron_lm_ptq.yaml` following the format above +2. Set `MLM_MODEL_CFG` to the HuggingFace repo ID +3. Set `QUANT_CFG` (e.g., `NVFP4_DEFAULT_CFG`, `INT8_DEFAULT_CFG`) +4. Set GPU/node counts based on model size +5. Test: `uv run launch.py --yaml <path> --dryrun --yes -v` + +## Testing + +64 unit tests in `tests/unit/launcher/`. 
Run standalone without installing `modelopt`: + +From the launcher directory: + +```shell +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" --confcutdir=../tests/unit/launcher +``` + +Tests cover: core dataclasses, factory registry, global_vars interpolation, YAML formats, Docker/Slurm executor construction (mocked), environment merging, metadata writing, and end-to-end Docker launch via subprocess. + +## Compatibility with nmm-sandbox + +The same YAML works with both launchers: + +```shell +# nmm-sandbox (internal) +uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# Model-Optimizer/launcher (public) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` + +Differences: `slurm.py` has internal cluster factories, `job_yaml` batch mode (via `tools/run_job_yaml.sh`), CI review integration, and `SLURM_CLUSTER` env var for factory selection. diff --git a/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml b/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml new file mode 100644 index 000000000..19b6cc0d2 --- /dev/null +++ b/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml @@ -0,0 +1,111 @@ +# EAGLE3 offline speculative decoding pipeline for Qwen3-8B. +# +# 4-step pipeline: +# task_0: Data synthesis — query TRT-LLM server to generate prompt samples +# task_1: Dump hidden states — run target model to capture hidden states +# task_2: Offline training — train the EAGLE3 draft head +# task_3: Benchmark — evaluate speculative decoding speedup via VLLM +# +# All tasks share /scratchspace to pass artifacts between steps. 
+# +# Usage: +# uv run launch.py --yaml Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes +# uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/hf_offline_eagle3.yaml --yes + +job_name: Qwen3-8B_EAGLE3_offline +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + # Step 1: Data synthesis via TRT-LLM server + # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py. + task_0: + script: common/tensorrt-llm/query.sh + args: + - --model <<global_vars.hf_model>> + - --tp_size 4 + - --ep_size 4 + - --max_num_tokens 32000 + - --port 8000 + - --host 0.0.0.0 + - --trust_remote_code + - -- + - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples + - --save /scratchspace/data + environment: + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 2: Dump hidden states from target model + task_1: + script: common/eagle3/dump_offline_data.sh + args: + - --input-data /scratchspace/data + - --output-dir /scratchspace/offline_hidden_states + - --max-seq-len 8192 + - --tp 4 + - --moe-ep 4 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 3: Train EAGLE3 draft head (offline, single task) + task_2: + script: common/eagle3/offline_training.sh + args: + - --offline-data /scratchspace/offline_hidden_states + - --data_path None + - --mode eagle3 + - --num_epochs 1 + - --lr 3e-4 + - --save_steps 500000 + - --output_dir /scratchspace/eagle3 + - --train_bs 8 + - --training_seq_len 4096 + - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json + - --disable_tqdm True + - --ar_validate_steps 500000 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + 
slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2 + + # Step 4: Benchmark speculative decoding (VLLM backend) + task_3: + script: common/specdec_bench/quick_check.sh + args: + - --draft_model_dir /scratchspace/export + - --draft_length 3 + - --output_length 4096 + - --engine VLLM + - --tp_size 4 + - --ep_size 1 + - --speculative_algorithm EAGLE3 + - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl + - --concurrency 1 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest diff --git a/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml new file mode 100644 index 000000000..ea83960ef --- /dev/null +++ b/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml @@ -0,0 +1,35 @@ +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + skip: false + allow_to_fail: false + note: + + # hf_local: path prefix for model weights and datasets. + # + # This should be a self-managed directory that mirrors the HuggingFace Hub + # hierarchy (e.g., /hf-local/Qwen/Qwen3-8B/, /hf-local/cais/mmlu/). Using + # a dedicated folder is preferred over the HuggingFace cache (~/.cache/huggingface) + # to avoid cache corruption issues with concurrent jobs. 
+ # + # Override on CLI: + # pipeline.global_vars.hf_local=/mnt/my-models/ # use a different path + # pipeline.global_vars.hf_local="" # download from HuggingFace Hub + global_vars: + hf_local: /hf-local/ + + task_0: + script: common/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name <<global_vars.hf_local>>abisee/cnn_dailymail + - --calib-size 32 + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - QUANT_CFG: NVFP4_DEFAULT_CFG + - HF_MODEL_CKPT: <<global_vars.hf_local>>Qwen/Qwen3-8B + - MMLU_DATASET: <<global_vars.hf_local>>cais/mmlu + - TP: 4 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 diff --git a/launcher/README.md b/launcher/README.md new file mode 100644 index 000000000..d5365a2fc --- /dev/null +++ b/launcher/README.md @@ -0,0 +1,298 @@ +# ModelOpt Launcher + +Submit ModelOpt quantization, training, and evaluation jobs to Slurm clusters or run them locally with Docker. + +## Quick Start + +```bash +# Install dependencies +curl -LsSf https://astral.sh/uv/install.sh | sh +git submodule update --init --recursive + +# Run locally (requires local GPUs and Docker) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + +# Run on a Slurm cluster +export SLURM_HOST=login-node.example.com +export SLURM_ACCOUNT=my_account +export SLURM_HF_LOCAL=/shared/hf-local +export SLURM_JOB_DIR=/shared/experiments +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` + +## Environment Variables + +| Variable | Description | Required | +|---|---|---| +| `SLURM_HOST` | Slurm login node hostname | Yes (remote jobs) | +| `SLURM_ACCOUNT` | Slurm account for billing | Yes (remote jobs) | +| `SLURM_JOB_DIR` | Remote directory for job artifacts | Yes (remote jobs) | +| `SLURM_HF_LOCAL` | Path to HuggingFace model cache on the cluster | Yes (remote jobs) | +| `HF_TOKEN` | HuggingFace API token | No | +| `NEMORUN_HOME` | NeMo Run home directory (default: cwd) | No | 
+ +## Model and Dataset Storage (`hf_local`) + +Pipeline YAMLs use a `global_vars.hf_local` path prefix for model weights and datasets. This should be a **self-managed directory that mirrors the HuggingFace Hub hierarchy**: + +```text +/hf-local/ +├── Qwen/Qwen3-8B/ # model weights +├── meta-llama/Llama-3.1-8B/ # model weights +├── abisee/cnn_dailymail/ # calibration dataset +└── cais/mmlu/ # evaluation dataset +``` + +Using a dedicated folder is preferred over the HuggingFace cache (`~/.cache/huggingface`) to avoid cache corruption from concurrent jobs writing to the same cache directory. + +You can populate it by copying or symlinking from an existing HuggingFace download: + +```bash +# Example: download a model and copy to hf_local +huggingface-cli download Qwen/Qwen3-8B --local-dir /hf-local/Qwen/Qwen3-8B +``` + +Override `hf_local` in any YAML via CLI: + +```bash +# Use a different local path +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.global_vars.hf_local=/mnt/my-models/ --yes + +# Download from HuggingFace Hub directly (no local cache) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.global_vars.hf_local="" --yes +``` + +For Slurm clusters, `SLURM_HF_LOCAL` sets the container mount path (e.g., `/lustre/.../hf-local:/hf-local`). 
+ +## Directory Structure + +```text +launcher/ +├── launch.py # Main entrypoint +├── core.py # Shared logic (also used by nmm-sandbox's slurm.py) +├── slurm_config.py # SlurmConfig dataclass and factory +├── pyproject.toml # Dependencies (nemo-run, pyyaml) +├── common/ # Shared scripts executed on the cluster +│ ├── service_utils.sh # Error handling, MPI rank utilities +│ ├── query.py # OpenAI-compatible query client +│ ├── megatron-lm/quantize/ +│ │ └── quantize.sh # PTQ quantization + MMLU evaluation +│ ├── tensorrt-llm/query.sh # TRT-LLM server launch + query +│ ├── vllm/query.sh # vLLM server launch + query +│ ├── eagle3/ # EAGLE3 speculative decoding scripts +│ └── specdec_bench/ # Speculative decoding benchmark +├── Qwen/Qwen3-8B/ # Example configs +│ ├── megatron_lm_ptq.yaml # PTQ quantization pipeline +│ └── hf_offline_eagle3.yaml # EAGLE3 offline pipeline +└── modules/ # Dependencies + ├── Megatron-LM/ # Git submodule: NVIDIA Megatron-LM + └── Model-Optimizer -> ../.. # Symlink to parent (auto-created if missing) +``` + +> **Note:** `modules/Model-Optimizer` is a symlink to the parent directory (`../..`), +> not a submodule. This avoids recursive nesting. `launch.py` auto-creates +> the symlink on first run if it's missing. + +## YAML Config Format + +A config YAML defines the job name, pipeline metadata, and one or more tasks: + +```yaml +job_name: Qwen3-8B_NVFP4_DEFAULT_CFG +pipeline: + skip: false + allow_to_fail: false + note: + + task_0: + script: common/megatron-lm/quantize/quantize.sh + args: + - --calib-dataset-path-or-name /hf-local/abisee/cnn_dailymail + - --calib-size 32 + environment: + - MLM_MODEL_CFG: Qwen/Qwen3-8B + - QUANT_CFG: NVFP4_DEFAULT_CFG + - TP: 4 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 4 + gpus_per_node: 4 +``` + +### Multi-task Pipeline + +Tasks run sequentially — `task_1` starts only after `task_0` completes. 
+Example (illustrative — export script may not exist yet): + +```yaml +job_name: Qwen3-8B_quantize_export +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + task_0: + script: common/megatron-lm/quantize/quantize.sh + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + + task_1: + script: common/megatron-lm/export/export.sh + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 +``` + +The `<<global_vars.X>>` syntax shares values across tasks. + +### `--yaml` vs `pipeline=@` + +There are two ways to load a config: + +**`--yaml config.yaml`** (recommended) — the YAML maps top-level keys to function arguments. +The file contains both `job_name` and `pipeline`: + +```yaml +# config.yaml — used with: uv run launch.py --yaml config.yaml --yes +job_name: Qwen3-8B_NVFP4 +pipeline: + task_0: + script: common/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +**`pipeline=@config.yaml`** — the YAML is a bare `SandboxPipeline` (no `job_name` or `pipeline` wrapper). 
+This is useful for reusing pipeline configs across different job names: + +```yaml +# bare_pipeline.yaml — used with: uv run launch.py pipeline=@bare_pipeline.yaml --yes +task_0: + script: common/megatron-lm/quantize/quantize.sh + slurm_config: + _factory_: "slurm_factory" +``` + +```bash +# With pipeline=@, set job_name separately +uv run launch.py pipeline=@bare_pipeline.yaml job_name=my_job --yes +``` + +### Overriding Parameters + +Any parameter can be overridden from the command line: + +```bash +# Change the number of nodes +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.task_0.slurm_config.nodes=2 --yes + +# Change the container image +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml \ + pipeline.task_0.slurm_config.container=nvcr.io/nvidia/tensorrt-llm/release:1.3.0 --yes +``` + +### Useful Flags + +| Flag | Description | +|---|---| +| `--yes` / `-y` | Skip confirmation prompt | +| `-v` | Verbose output | +| `--dryrun` | Resolve and print the full config without running | +| `--to-yaml output.yaml` | Dump the resolved config to a YAML file without running | +| `detach=true` | Submit the job and return immediately (don't wait for completion) | + +```bash +# Preview the resolved config (all factory defaults expanded) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --dryrun --yes -v + +# Dump resolved config to file for inspection or reproducibility +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml resolved.yaml + +# Submit and detach +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml detach=true --yes +``` + +## Adding a New Model + +1. Create a directory: `<Organization>/<ModelName>/` +2. Add a YAML config (e.g., `megatron_lm_ptq.yaml`) following the format above +3. Set `MLM_MODEL_CFG` to the HuggingFace model ID +4. Choose `QUANT_CFG` (e.g., `NVFP4_DEFAULT_CFG`, `INT8_DEFAULT_CFG`) +5. Set `nodes`, `ntasks_per_node`, `gpus_per_node` based on model size + +## How It Works + +1. 
`launch.py` parses the YAML and creates a `SandboxPipeline` with tasks and `SlurmConfig` +2. Code is packaged via `PatternPackager` — `modules/Megatron-LM/`, `modules/Model-Optimizer/` (via symlink), and `common/` are synced +3. For remote jobs: code is rsynced to the cluster, an sbatch script is generated and submitted via SSH +4. For local jobs: a Docker container is launched with the same container image and mounts +5. The `code/` directory on the cluster mirrors the launcher structure: + +```text +code/ +├── modules/ +│ ├── Megatron-LM/megatron/... +│ └── Model-Optimizer/modelopt/... +└── common/... +``` + +## Running Tests + +```bash +cd launcher +uv pip install pytest +uv run python3 -m pytest ../tests/unit/launcher/ -v -o "addopts=" \ + --confcutdir=../tests/unit/launcher +``` + +64 unit tests cover core dataclasses, factory registry, YAML parsing, Docker/Slurm executor construction, environment merging, and end-to-end Docker launch. + +## Reporting Bugs + +When filing a bug report, please include: + +1. **Version summary** — printed at the start of every run: + + ```text + ============================================================ + Version Report + ============================================================ + Launcher d28acd33 (main) + Megatron-LM 1e064f361 (main) + Model-Optimizer 69c0d479 (main) + ============================================================ + ``` + +2. **Reproducible config** — dump with `--to-yaml`: + + ```bash + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --to-yaml bug_report.yaml + ``` + +3. **Error output** — the relevant error message or traceback from the job log. + +File issues at: <https://github.com/NVIDIA/Model-Optimizer/issues> + +## Compatibility with nmm-sandbox + +This launcher produces the same `code/` layout as [nmm-sandbox](https://gitlab-master.nvidia.com/omniml/integration/nmm-sandbox)'s `slurm.py`. 
The same YAML configs work with both: + +```bash +# From nmm-sandbox (internal) +uv run slurm.py --yaml modules/Model-Optimizer/launcher/Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + +# From Model-Optimizer/launcher (public) +uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes +``` + +Verified: identical MMLU results (0.719 local, 0.730 OCI-HSG) from both launchers. + +For architecture details, factory system, and Claude Code workflows, see [ADVANCED.md](ADVANCED.md). diff --git a/launcher/__init__.py b/launcher/__init__.py new file mode 100644 index 000000000..11b92d8b7 --- /dev/null +++ b/launcher/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters.""" diff --git a/launcher/common/eagle3/dump_offline_data.sh b/launcher/common/eagle3/dump_offline_data.sh new file mode 100644 index 000000000..a11f7f7ed --- /dev/null +++ b/launcher/common/eagle3/dump_offline_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +trtllm-llmapi-launch python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py \ + --model ${HF_MODEL_CKPT} \ + --dp-rank ${TASK_ID} \ + --dp-world-size ${TASK_COUNT} \ + ${@} diff --git a/launcher/common/eagle3/offline_training.sh b/launcher/common/eagle3/offline_training.sh new file mode 100644 index 000000000..4dfe2de7c --- /dev/null +++ b/launcher/common/eagle3/offline_training.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source ${SCRIPT_DIR}/service_utils.sh
+
+pip install -r modules/Model-Optimizer/examples/speculative_decoding/requirements.txt
+# Quote the requirement spec: unquoted, ">=1.2.1" is parsed by the shell as an
+# output redirection to a file named "=1.2.1", so no version constraint was applied.
+pip install "huggingface-hub>=1.2.1"
+export PATH=$PATH:/workspace/.local/bin
+
+###################################################################################################
+
+trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
+
+bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \
+    --model ${HF_MODEL_CKPT} \
+    ${@}
+
+python modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \
+    --model_path /scratchspace/eagle3 \
+    --export_path /scratchspace/export
+
+###################################################################################################
+
+# This function handles the exit status (fails the CI).
+#exit_handler $0 diff --git a/launcher/common/megatron-lm/quantize/quantize.sh b/launcher/common/megatron-lm/quantize/quantize.sh new file mode 100755 index 000000000..6e4d21b99 --- /dev/null +++ b/launcher/common/megatron-lm/quantize/quantize.sh @@ -0,0 +1,47 @@ +#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../../service_utils.sh + +util_install_extra_dep + +trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER +################################################################################################### + +if [[ -z ${HF_MODEL_CKPT} ]]; then + export HF_MODEL_CKPT="/hf-local/${MLM_MODEL_CFG}" +fi +export MLM_MODEL_SAVE="/scratchspace/megatron-lm/${MLM_MODEL_CFG}" +export EXPORT_DIR="/scratchspace/export/${MLM_MODEL_CFG}_${QUANT_CFG}" +export MLM_SKIP_INSTALL=1 + +QUANTIZE_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/quantize.sh" +MMLU_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/mmlu.sh" +CONVERT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/convert.sh" +EXPORT_EXE="bash modules/Megatron-LM/examples/post_training/modelopt/export.sh" + +export MLM_EXTRA_ARGS=${@} +${QUANTIZE_EXE} ${MLM_MODEL_CFG} ${QUANT_CFG} + +export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound 0.38 --disable-tqdm" +MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG} + +################################################################################################### + +# This function handles the exit status (fails the CI). +exit_handler $0 diff --git a/launcher/common/query.py b/launcher/common/query.py new file mode 100644 index 000000000..79ec93f54 --- /dev/null +++ b/launcher/common/query.py @@ -0,0 +1,147 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ruff: noqa: D100,D101,D102,D103,D107,F841,PLR1722
+import argparse
+import os
+
+from datasets import load_dataset
+from openai import OpenAI
+
+early_termination = False
+
+
+class LLM:
+    """Thin wrapper around an OpenAI-compatible chat completions endpoint."""
+
+    def __init__(self, args):
+        self.args = args
+        self.client = OpenAI(base_url=args.base_url)
+        # Warm-up request: verifies connectivity and prints a sample exchange.
+        self.generate(messages=[{"role": "user", "content": "Hello! /no_think"}], verbose=True)
+
+    def generate(self, messages, verbose=False, **chat_template_kwargs):
+        """Send one chat request; return the assistant message dict, or None on error."""
+        try:
+            completion = self.client.chat.completions.create(
+                model=self.args.model,
+                messages=messages,
+                temperature=self.args.temperature,
+            )
+            new_message = completion.choices[0].message.content
+            if verbose:
+                for msg in messages:
+                    print("[OLD] {:10}: {:64}".format(msg["role"], msg["content"]))
+                print("[NEW] {:10}: {:64}\n\n".format("assistant", new_message))
+
+            new_message = {"role": "assistant", "content": new_message}
+        except Exception as e:
+            print(e)
+
+            if "Connection error" in str(e):
+                # Fix: without this declaration the assignment below created a
+                # function-local variable and the module-level flag was never
+                # set, so the early-termination check in the shard loop could
+                # never fire. NOTE(review): when generate() runs inside
+                # dataset.map(num_proc>1) workers, the flag is still only set
+                # in the worker process — confirm this is acceptable.
+                global early_termination
+                early_termination = True
+
+            new_message = None
+
+        return new_message
+
+
+parser = argparse.ArgumentParser(prog="query")
+parser.add_argument("base_url", type=str, help="url to the OpenAI compatible API.")
+parser.add_argument("model", type=str, help="model name")
+parser.add_argument(
+    "--data", type=str, default=None, help="path to OAI chat data (local or HF hub)"
+)
+parser.add_argument("--data-split", type=str, default="train", help="HF dataset split")
+parser.add_argument("--save", type=str, default=None, help="path to store the generated output.")
+parser.add_argument("--num-shards", type=int, default=1000, 
help="number of shards.")
+parser.add_argument("--shard-id-begin", type=int, default=0, help="the shard id to start.")
+parser.add_argument(
+    "--shard-id-step", type=int, default=1, help="the step that the shard id progress."
+)
+parser.add_argument("--num-proc", type=int, default=32, help="number of processes (concurrency).")
+parser.add_argument("--temperature", type=float, default=0.0, help="temperature.")
+args = parser.parse_args()
+
+llm = LLM(args)
+
+if args.data is None:
+    exit(0)
+
+
+def disable_thinking_column(data):
+    """Mark a row so synthesize() appends /no_think to user turns."""
+    data.update({"enable_thinking": False})
+    return data
+
+
+def synthesize(data):
+    """Replay a conversation's user turns against the server, collecting fresh assistant replies."""
+    messages = data.get("conversations", None)
+    if messages is None:
+        messages = data.get("messages", None)
+    if messages is None:
+        raise ValueError(
+            "No conversations or messages in the data. Only OAI chat data is supported."
+        )
+
+    # Handle generation specific kwargs.
+    enable_thinking = data.get("enable_thinking", True)
+
+    current_messages = []
+
+    for msg in messages:
+        if msg["role"] == "system":
+            current_messages.append(msg)
+        elif msg["role"] == "user":
+            if not enable_thinking:
+                msg["content"] = msg["content"] + " /no_think"
+
+            current_messages.append(msg)
+            new_message = llm.generate(current_messages, verbose=False)
+            if new_message is None:
+                break
+            else:
+                current_messages.append(new_message)
+        elif msg["role"] == "assistant":
+            # Original assistant messages are not used
+            pass
+        else:
+            raise ValueError("unknown role: {}".format(msg["role"]))
+
+    return {"conversations": current_messages}
+
+
+dataset = load_dataset(args.data, split=args.data_split)
+
+if args.num_shards * 100 > len(dataset):
+    # Clamp to at least 1 shard: for datasets smaller than 100 rows the old
+    # expression yielded num_shards == 0, which crashes dataset.shard().
+    args.num_shards = max(1, min(16, len(dataset) // 100))
+
+if args.save is not None:
+    print("Create save dir: {}".format(args.save))
+    os.makedirs(args.save, exist_ok=True)
+
+for shard_id in range(args.shard_id_begin, args.num_shards, args.shard_id_step):
+    file_path = args.save + "/train-{:05}-{:05}.jsonl".format(shard_id + 1, args.num_shards)
+
+    if 
os.path.exists(file_path): + continue + + shard = dataset.shard(num_shards=args.num_shards, index=shard_id) + print(len(shard), file_path) + + if shard_id % 2 == 0: + shard = shard.map(disable_thinking_column, num_proc=args.num_proc) + updated_shard = shard.map(synthesize, num_proc=args.num_proc) + updated_shard.to_json(file_path) + print(updated_shard[0]) + + if early_termination: + print("Terminate earlier due to server connection error!") + break diff --git a/launcher/common/service_utils.sh b/launcher/common/service_utils.sh new file mode 100755 index 000000000..f9d15b279 --- /dev/null +++ b/launcher/common/service_utils.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +native_mpi_rank=$OMPI_COMM_WORLD_RANK +native_mpi_local_rank=$OMPI_COMM_WORLD_LOCAL_RANK +# Works with Slurm launching with `--mpi=pmix` +mpi_rank=${PMIX_RANK:-$native_mpi_rank} +mpi_local_rank=${PMIX_LOCAL_RANK:-$native_mpi_local_rank} + +FAIL=0 +FAIL_EXIT=0 + +function error_handler { + local last_status_code=$? + echo "[ERROR] $1:$2 failed with status $last_status_code." 
>&2 + + if [[ "$mpi_rank" -eq 0 ]]; then + echo "<REPORT>$1:$2</REPORT>" >&2 + fi + FAIL=1 + FAIL_EXIT=1 +} + +function exit_handler { + if [[ $FAIL_EXIT == 1 ]]; then + exit 1 + fi +} + +function report_result { + if [[ "$mpi_rank" -eq 0 ]]; then + echo "<REPORT>$1</REPORT>" + fi +} + +function util_install_extra_dep { + if [[ "$mpi_local_rank" -eq 0 ]]; then + pip install diskcache + fi +} + +LOCAL_NUM_GPUS=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1) +printf "RANK ${mpi_rank} GPU count: ${LOCAL_NUM_GPUS}\n" + +# Increase the modelopt version number manually +if [[ "$mpi_local_rank" -eq 0 ]]; then + echo "__version__ = '1.0.0'" >> ./modules/Model-Optimizer/modelopt/__init__.py +fi diff --git a/launcher/common/specdec_bench/quick_check.sh b/launcher/common/specdec_bench/quick_check.sh new file mode 100644 index 000000000..d90413969 --- /dev/null +++ b/launcher/common/specdec_bench/quick_check.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### + + +${TRTLLM_LAUNCH_SCRIPT} python3 modules/Model-Optimizer/examples/specdec_bench/run.py \ + --model_dir ${HF_MODEL_CKPT} \ + --tokenizer ${HF_MODEL_CKPT} \ + ${@} diff --git a/launcher/common/tensorrt-llm/query.sh b/launcher/common/tensorrt-llm/query.sh new file mode 100644 index 000000000..3bc2ec106 --- /dev/null +++ b/launcher/common/tensorrt-llm/query.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### +# Usage: +# query.sh --model MODEL [SERVE_ARGS...] -- [QUERY_ARGS...] +# +# Launches trtllm-serve with the given model, waits for it to be ready, +# then runs common/query.py against the server. +# +# --model MODEL is required and is consumed by this script. It is used as the +# positional model argument for both trtllm-serve and common/query.py. +# +# Remaining arguments are split on "--": +# - Args BEFORE "--" are appended to the trtllm-serve command (SERVE_ARGS). 
+# - Args AFTER "--" are passed to common/query.py (QUERY_ARGS). +# - If "--" is absent, all remaining args go to common/query.py. +# +# Environment variables (optional, set by Slurm): +# SLURM_ARRAY_TASK_ID Used to shard query.py work across array jobs. +# SLURM_ARRAY_TASK_COUNT Total number of array tasks for sharding. +# +# In a pipeline YAML task config: +# args: +# - --model /hf-local/Qwen/Qwen3-8B # required +# - --tp_size 4 # trtllm-serve args (before --) +# - --ep_size 4 +# - --max_num_tokens 32000 +# - --port 8000 +# - --host 0.0.0.0 +# - --trust_remote_code +# - -- # separator +# - --data /hf-local/dataset # query.py args (after --) +# - --save /scratchspace/data +################################################################################################### + +export OPENAI_API_KEY="token-abc123" + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +# Parse --model and split remaining args on "--". +# --model is consumed here; args before "--" go to trtllm-serve, args after go to query.py. +MODEL="" +SERVE_EXTRA_ARGS=() +QUERY_ARGS=(--shard-id-begin ${TASK_ID} --shard-id-step ${TASK_COUNT}) +past_separator=false +skip_next=false + +for arg in "$@"; do + if $skip_next; then + MODEL="$arg" + skip_next=false + elif [ "$arg" = "--model" ]; then + skip_next=true + elif [ "$arg" = "--" ]; then + past_separator=true + elif [ "$past_separator" = false ]; then + SERVE_EXTRA_ARGS+=("$arg") + else + QUERY_ARGS+=("$arg") + fi +done + +trtllm-llmapi-launch trtllm-serve \ + ${MODEL} \ + "${SERVE_EXTRA_ARGS[@]}" \ + & + + +# Wait for server to start up by polling the health endpoint +echo "Waiting for server to start..." 
+while true; do + response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true) + if [ "$response" -eq 200 ]; then + echo "Server is up!" + break + fi + echo "Server not ready yet, retrying in 10 seconds..." + sleep 10 +done + +if [[ "$mpi_rank" -eq 0 ]]; then + cmd="python common/query.py http://localhost:8000/v1 ${MODEL} ${QUERY_ARGS[*]}" + echo "Running command: $cmd" + eval $cmd + echo "Main process exit" +else + while true; do + response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true) + if [[ "$response" -ne 200 ]]; then + break + fi + #echo "Server is up!" + sleep 60 + done +fi + +pkill trtllm-serve + +exit 0 diff --git a/launcher/common/vllm/query.sh b/launcher/common/vllm/query.sh new file mode 100755 index 000000000..d203e8994 --- /dev/null +++ b/launcher/common/vllm/query.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +source ${SCRIPT_DIR}/../service_utils.sh + +################################################################################################### +# Usage: +# query.sh --model MODEL [SERVE_ARGS...] -- [QUERY_ARGS...] +# +# Launches vllm serve with the given model, waits for it to be ready, +# then runs common/query.py against the server. 
+# +# --model MODEL is required and is consumed by this script. It is used as the +# positional model argument for both vllm serve and common/query.py. +# +# Remaining arguments are split on "--": +# - Args BEFORE "--" are appended to the vllm serve command (SERVE_ARGS). +# - Args AFTER "--" are passed to common/query.py (QUERY_ARGS). +# - If "--" is absent, all remaining args go to common/query.py. +# +# Environment variables (optional, set by Slurm): +# SLURM_ARRAY_TASK_ID Used to shard query.py work across array jobs. +# SLURM_ARRAY_TASK_COUNT Total number of array tasks for sharding. +# +# vLLM notes: +# - vLLM manages GPU distribution internally; run with ntasks_per_node: 1 +# in slurm_config and pass --tensor-parallel-size to match gpus_per_node. +# - NVFP4 models require vllm/vllm-openai:v0.15.0+ on Blackwell GPUs. +# - Use --trust-remote-code for models with custom architectures (e.g. Kimi). +# +# In a pipeline YAML task config: +# args: +# - --model /hf-local/Qwen/Qwen3-8B # required +# - --tensor-parallel-size 4 # vllm serve args (before --) +# - --max-num-seqs 32 +# - --trust-remote-code +# - -- # separator +# - --data /hf-local/dataset # query.py args (after --) +# - --save /scratchspace/data +# slurm_config: +# ntasks_per_node: 1 # vLLM is single-process +# gpus_per_node: 4 +################################################################################################### + +export OPENAI_API_KEY="token-abc123" + +if [ -z ${SLURM_ARRAY_TASK_ID} ]; then + TASK_ID=0 +else + echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}" + TASK_ID=${SLURM_ARRAY_TASK_ID} +fi + +if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then + TASK_COUNT=1 +else + echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}" + TASK_COUNT=${SLURM_ARRAY_TASK_COUNT} +fi + +# Parse --model and split remaining args on "--". +# --model is consumed here; args before "--" go to vllm serve, args after go to query.py. 
+MODEL=""
+SERVE_EXTRA_ARGS=()
+QUERY_ARGS=(--shard-id-begin ${TASK_ID} --shard-id-step ${TASK_COUNT})
+past_separator=false
+skip_next=false
+
+for arg in "$@"; do
+    if $skip_next; then
+        MODEL="$arg"
+        skip_next=false
+    elif [ "$arg" = "--model" ]; then
+        skip_next=true
+    elif [ "$arg" = "--" ]; then
+        past_separator=true
+    elif [ "$past_separator" = false ]; then
+        SERVE_EXTRA_ARGS+=("$arg")
+    else
+        QUERY_ARGS+=("$arg")
+    fi
+done
+
+# vLLM is single-process: GPU parallelism is handled internally via --tensor-parallel-size.
+# No MPI multi-rank logic needed; this script always runs as a single task.
+vllm serve \
+    ${MODEL} \
+    "${SERVE_EXTRA_ARGS[@]}" \
+    &
+SERVER_PID=$!
+
+
+# Wait for server to start up by polling the health endpoint
+echo "Waiting for server to start..."
+while true; do
+    response=$(curl -s -o /dev/null -w "%{http_code}" "http://$(hostname -f):8000/health" || true)
+    if [ "$response" -eq 200 ]; then
+        echo "Server is up!"
+        break
+    fi
+    echo "Server not ready yet, retrying in 10 seconds..."
+    sleep 10
+done
+
+cmd="python common/query.py http://localhost:8000/v1 ${MODEL} ${QUERY_ARGS[*]}"
+echo "Running command: $cmd"
+eval $cmd
+QUERY_STATUS=$?
+echo "Main process exit"
+
+kill $SERVER_PID
+wait $SERVER_PID 2>/dev/null || true
+
+# Propagate query.py's exit status instead of always exiting 0; this script
+# sets no ERR trap, so a failed query run was previously silently swallowed.
+exit ${QUERY_STATUS} diff --git a/launcher/core.py b/launcher/core.py new file mode 100644 index 000000000..de2f5b061 --- /dev/null +++ b/launcher/core.py @@ -0,0 +1,488 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared core logic for ModelOpt launcher and nmm-sandbox slurm.py. + +This module contains all dataclasses, executor builders, and the job run loop +shared between the public launcher (launch.py) and the internal CI orchestrator +(slurm.py). Each caller provides its own config (packager, defaults, experiment +title) and thin entrypoint. +""" + +import dataclasses +import getpass +import json +import os +import re +from dataclasses import dataclass + +import nemo_run as run +import yaml + +# --------------------------------------------------------------------------- +# Default environment variables injected into every job +# --------------------------------------------------------------------------- + +DEFAULT_EXPERIMENT_TITLE = "cicd" + + +def get_default_env(experiment_title=None): + """Return (slurm_env, local_env) dicts for the given experiment title.""" + title = experiment_title or DEFAULT_EXPERIMENT_TITLE + slurm_env = { + "TRITON_CACHE_DIR": f"/{title}/triton-cache", + "HF_HOME": f"/{title}/hf-cache", + "HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", + "LAUNCH_SCRIPT": "python", + } + local_env = { + "TRITON_CACHE_DIR": f"/{title}/triton-cache", + "HF_HOME": f"/{title}/hf-cache", + "HF_TOKEN": os.getenv("HF_TOKEN", ""), + "MLM_SKIP_INSTALL": "1", + } + return slurm_env, local_env + + +# SlurmConfig type — set by the caller via set_slurm_config_type() before use. +# This allows both slurm.py and launch.py to use their own SlurmConfig class. 
+_SLURM_CONFIG_TYPE = None +_FACTORY_REGISTRY = {} + + +def set_slurm_config_type(cls): + """Register the SlurmConfig dataclass type used by SandboxTask.""" + global _SLURM_CONFIG_TYPE + _SLURM_CONFIG_TYPE = cls + # Patch SandboxTask's type annotation so nemo-run's CLI parser can resolve factories + SandboxTask.__dataclass_fields__["slurm_config"].type = cls + SandboxTask.__annotations__["slurm_config"] = cls + + +def register_factory(name, fn): + """Register a factory function by name for task_configs YAML resolution.""" + _FACTORY_REGISTRY[name] = fn + + +# --------------------------------------------------------------------------- +# Task and pipeline dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class SandboxTask: + """A single task with a script, slurm config, args, and environment.""" + + script: str = None + slurm_config: object = None # Patched at runtime by set_slurm_config_type() + args: list[str] = None + environment: list[dict[str, str]] = None + yaml_file: str = None + skip: bool = False + + +@dataclass +class SandboxTask0(SandboxTask): + """Task slot 0 in a pipeline.""" + + +@dataclass +class SandboxTask1(SandboxTask): + """Task slot 1 in a pipeline.""" + + +@dataclass +class SandboxTask2(SandboxTask): + """Task slot 2 in a pipeline.""" + + +@dataclass +class SandboxTask3(SandboxTask): + """Task slot 3 in a pipeline.""" + + +@dataclass +class SandboxTask4(SandboxTask): + """Task slot 4 in a pipeline.""" + + +def create_task_from_yaml(yaml_file, factory_lookup): + """Create a SandboxTask from a YAML config file. + + Args: + yaml_file: Path to the YAML config. + factory_lookup: Dict mapping factory names to callable factory functions. 
+ """ + with open(yaml_file) as file: + config_from_yaml = yaml.safe_load(file) + + script = config_from_yaml["script"] + function_name = config_from_yaml["slurm_config"].pop("_factory_") + slurm_config = factory_lookup[function_name](**config_from_yaml["slurm_config"]) + args = config_from_yaml.get("args", None) + environment = config_from_yaml.get("environment", None) + + return SandboxTask(script=script, slurm_config=slurm_config, args=args, environment=environment) + + +@dataclass +class GlobalVariables: + """Shared variables for <<global_vars.X>> interpolation in pipeline YAMLs.""" + + hf_model: str = None + hf_data: str = None + hf_local: str = None + + +@dataclass +class SandboxPipeline: + """A multi-task pipeline with shared global variables and task dependencies.""" + + global_vars: GlobalVariables = None + + task_0: SandboxTask0 = None + task_1: SandboxTask1 = None + task_2: SandboxTask2 = None + task_3: SandboxTask3 = None + task_4: SandboxTask4 = None + tasks: list[SandboxTask] = None + + test_level: int = 0 + allow_to_fail: bool = False + skip: bool = False + note: str = "" + task_configs: list[str] = None + experiment = None + + # Set by caller — used by create_task_from_yaml + _factory_lookup: dict = None + + def __post_init__(self): + """Collect tasks from slots/configs and resolve <<global_vars.X>> references.""" + if self.tasks is None: + self.tasks = [] + for i in range(5): + task = getattr(self, f"task_{i}", None) + if task is not None: + self.tasks += [task] + if self.task_configs is not None: + lookup = self._factory_lookup or _FACTORY_REGISTRY + if lookup: + self.tasks += [ + create_task_from_yaml(yaml_file=yf, factory_lookup=lookup) + for yf in self.task_configs + ] + + if self.global_vars is not None: + global_vars_dict = { + k: v for k, v in dataclasses.asdict(self.global_vars).items() if v is not None + } + + def _resolve(s): + """Replace <<global_vars.X>> with the corresponding value.""" + if not isinstance(s, str): + return s + return 
re.sub( + r"<<global_vars\.(\w+)>>", + lambda m: global_vars_dict.get(m.group(1), m.group(0)), + s, + ) + + for task in self.tasks: + if task.environment: + if isinstance(task.environment, list): + task.environment = [ + {k: _resolve(v) for k, v in item.items()} for item in task.environment + ] + else: + task.environment = {k: _resolve(v) for k, v in task.environment.items()} + if task.args: + task.args = [_resolve(a) for a in task.args] + + +# --------------------------------------------------------------------------- +# Executor builders +# --------------------------------------------------------------------------- + + +def build_slurm_executor( + user, + identity, + slurm_config, + experiment_id, + job_dir, + task_name, + packager, + experiment_title="cicd", +): + """Build a SlurmExecutor for remote job submission.""" + container_mounts = list(slurm_config.container_mounts or []) + + scratch_dst = "/scratchspace" + scratch_src = f"{job_dir}/{experiment_title}/{experiment_id}" + modelopt_dst = slurm_config.modelopt_install_path + modelopt_src = ( + f"{job_dir}/{experiment_title}/{experiment_id}" + f"/{task_name}/code/modules/Model-Optimizer/modelopt" + ) + container_mounts += [ + f"{scratch_src}:{scratch_dst}", + f"{modelopt_src}:{modelopt_dst}", + f"{job_dir}/{experiment_title}:/{experiment_title}", + ] + + tunnel = run.SSHTunnel( + host=slurm_config.host, + user=getpass.getuser() if user is None else user, + port=slurm_config.port, + job_dir=job_dir, + identity=identity, + ) + + executor = run.SlurmExecutor( + account=slurm_config.account, + partition=slurm_config.partition, + ntasks_per_node=slurm_config.ntasks_per_node, + gpus_per_node=slurm_config.gpus_per_node, + nodes=slurm_config.nodes, + tunnel=tunnel, + container_image=slurm_config.container, + container_mounts=container_mounts, + array=slurm_config.array, + time="04:00:00", + mem="0", + retries=0, + packager=packager, + srun_args=slurm_config.srun_args, + ) + return executor + + +def 
build_docker_executor( + hf_local, + slurm_config, + experiment_id, + job_dir, + task_name, + packager, + modelopt_src_path=None, + experiment_title="cicd", +): + """Build a DockerExecutor for local GPU jobs.""" + if slurm_config.local: + container_mounts = list(slurm_config.container_mounts or []) + else: + container_mounts = [] + container_mounts += [f"{hf_local}:/hf-local"] + + scratch_dst = "/scratchspace" + scratch_src = os.path.join(job_dir, experiment_title, experiment_id, task_name) + os.makedirs(scratch_src, exist_ok=True) + modelopt_dst = slurm_config.modelopt_install_path + if modelopt_src_path is None: + modelopt_src_path = os.path.join(os.getcwd(), "modules/Model-Optimizer/modelopt") + exp_title_src = os.path.join(job_dir, experiment_title) + os.makedirs(exp_title_src, exist_ok=True) + container_mounts += [ + f"{scratch_src}:{scratch_dst}", + f"{modelopt_src_path}:{modelopt_dst}", + f"{exp_title_src}:/{experiment_title}", + ] + + executor = run.DockerExecutor( + num_gpus=-1, + runtime="nvidia", + ipc_mode="host", + container_image=slurm_config.container, + volumes=container_mounts, + additional_kwargs={"user": f"{os.getuid()}:{os.getgid()}"}, + packager=packager, + ) + return executor + + +# --------------------------------------------------------------------------- +# Version reporting +# --------------------------------------------------------------------------- + + +def _git_info(path): + """Get git commit hash and branch for a directory.""" + import subprocess # nosec B404 + + try: + commit = subprocess.run( # nosec B603 B607 + ["git", "rev-parse", "--short", "HEAD"], + cwd=path, + capture_output=True, + text=True, + timeout=5, + ).stdout.strip() + branch = subprocess.run( # nosec B603 B607 + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=path, + capture_output=True, + text=True, + timeout=5, + ).stdout.strip() + return commit, branch + except Exception: + return "unknown", "unknown" + + +def report_versions(base_dir): + """Print git commit 
and branch for the launcher and all submodules.""" + print("=" * 60) + print("Version Report") + print("=" * 60) + + # Launcher / repo root + commit, branch = _git_info(base_dir) + print(f" {'Launcher':<30} {commit:<12} ({branch})") + + # Submodules + modules_dir = os.path.join(base_dir, "modules") + if os.path.isdir(modules_dir): + for name in sorted(os.listdir(modules_dir)): + sub_path = os.path.join(modules_dir, name) + if os.path.exists(os.path.join(sub_path, ".git")): + commit, branch = _git_info(sub_path) + print(f" {name:<30} {commit:<12} ({branch})") + + print("=" * 60) + + +# --------------------------------------------------------------------------- +# Shared job run loop +# --------------------------------------------------------------------------- + + +def run_jobs( + job_table, + hf_local, + user, + identity, + job_dir, + packager, + default_slurm_env, + default_local_env, + experiment_title="cicd", + detach=False, + test_level=0, + modelopt_src_path=None, + base_dir=None, +): + """Run all jobs in job_table. + + Args: + job_table: Dict mapping job_name -> SandboxPipeline. + hf_local: Path to local HF cache (None for remote Slurm). + user: SSH user. + identity: SSH identity file. + job_dir: Base directory for job artifacts. + packager: PatternPackager instance. + default_slurm_env: Default env vars for Slurm jobs. + default_local_env: Default env vars for local Docker jobs. + experiment_title: Experiment title (e.g., "cicd" or "modelopt"). + detach: Whether to detach from the experiment. + test_level: Only run jobs with test_level <= this value. + modelopt_src_path: Path to modelopt source for Docker mounts. + base_dir: Base directory for version reporting (default: cwd). 
+ """ + report_versions(base_dir or os.getcwd()) + + for job_name, job in job_table.items(): + if job.test_level > test_level: + job.skip = True + if job.skip: + continue + + dependency = None + exp = run.Experiment(experiment_title, log_level="INFO") + job.experiment = exp + + with exp: + for task_id, task in enumerate(job.tasks): + if task.skip: + print(f"job {job_name} task {task_id}: skipped") + continue + task_name = f"{job_name}_{task_id}" + task_args = [] if task.args is None else task.args + + task_env = {} + if task.environment is not None: + if isinstance(task.environment, list): + for item in task.environment: + task_env.update(item.items()) + else: + task_env = task.environment + for k, v in task_env.items(): + task_env[k] = "" if v is None else str(v) + + if hf_local is not None: + executor = build_docker_executor( + hf_local, + task.slurm_config, + exp._id, + job_dir, + task_name, + packager, + modelopt_src_path, + experiment_title, + ) + task_env.update(default_local_env) + else: + executor = build_slurm_executor( + user, + identity, + task.slurm_config, + exp._id, + job_dir, + task_name, + packager, + experiment_title, + ) + task_env.update(default_slurm_env) + + task_instance = run.Script(task.script, args=task_args, env=task_env) + print(f"job {job_name} task {task_id} slurm_config: {task.slurm_config}") + + if dependency is None: + dependency = exp.add( + task_instance, tail_logs=True, name=task_name, executor=executor + ) + else: + dependency = exp.add( + task_instance, + tail_logs=True, + name=task_name, + executor=executor, + dependencies=[dependency], + ) + + exp.run(detach=detach) + + # Write metadata for downstream tools + metadata = { + "experiment_id": exp._id, + "job_name": job_name, + "allow_to_fail": job.allow_to_fail, + "note": job.note, + } + metadata_path = os.path.join("experiments", experiment_title, exp._id, "metadata.json") + os.makedirs(os.path.dirname(metadata_path), exist_ok=True) + with open(metadata_path, "w") as f: + 
json.dump(metadata, f) diff --git a/launcher/launch.py b/launcher/launch.py new file mode 100644 index 000000000..934104264 --- /dev/null +++ b/launcher/launch.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters. + +Usage: + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml --yes + uv run launch.py --yaml Qwen/Qwen3-8B/megatron_lm_ptq.yaml hf_local=/mnt/hf-local --yes + +Environment variables: + SLURM_HOST Slurm login node hostname (required for remote jobs) + SLURM_ACCOUNT Slurm account/partition billing (default: from YAML) + SLURM_JOB_DIR Remote directory for job artifacts + SLURM_HF_LOCAL Path to HuggingFace model cache on the cluster + HF_TOKEN HuggingFace API token + NEMORUN_HOME NeMo Run home directory (default: current working directory) +""" + +import getpass +import os +import warnings + +import nemo_run as run +from core import SandboxPipeline, get_default_env, register_factory, run_jobs, set_slurm_config_type +from slurm_config import SlurmConfig, slurm_factory + +set_slurm_config_type(SlurmConfig) +register_factory("slurm_factory", slurm_factory) + +# --------------------------------------------------------------------------- +# Launcher-specific configuration +# 
# ---------------------------------------------------------------------------

# Directory containing this file; the parent is the Model-Optimizer repo root.
LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__))
MODELOPT_ROOT = os.path.dirname(LAUNCHER_DIR)

# Ensure modules/Model-Optimizer symlink exists (points to parent Model-Optimizer root)
_mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer")
if not os.path.exists(_mo_symlink):
    os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True)
    # Relative symlink so the checkout remains relocatable.
    os.symlink(os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink)

EXPERIMENT_TITLE = "cicd"
DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE)

# Package only the source trees the jobs need; everything is taken relative
# to LAUNCHER_DIR (one entry in relative_path per include pattern).
packager = run.PatternPackager(
    include_pattern=[
        "modules/Megatron-LM/megatron/*",
        "modules/Megatron-LM/examples/*",
        "modules/Megatron-LM/*.py",
        "modules/Model-Optimizer/modelopt/*",
        "modules/Model-Optimizer/examples/*",
        "common/*",
    ],
    relative_path=[LAUNCHER_DIR] * 6,
)

MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt")


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------


@run.cli.entrypoint
def launch(
    job_name: str = "01_job",
    job_dir: str = os.environ.get("SLURM_JOB_DIR", os.path.expanduser("~/experiments")),
    pipeline: SandboxPipeline = None,
    hf_local: str = None,  # noqa: RUF013
    user: str = getpass.getuser(),
    identity: str = None,  # noqa: RUF013
    detach: bool = False,
) -> None:
    """Launch ModelOpt jobs on Slurm or locally with Docker.

    Args:
        job_name: Key under which the pipeline is registered in the job table.
        job_dir: Artifact directory (overridden to ./local_experiments for
            local Docker runs).
        pipeline: SandboxPipeline built from YAML (pipeline=@<yaml> / --yaml).
        hf_local: Local HF cache path; when set, runs via Docker instead of Slurm.
        user: SSH user for the Slurm tunnel.
        identity: SSH identity file.
        detach: Detach from the experiment after submission.
    """
    if "NEMORUN_HOME" not in os.environ:
        warnings.warn("NEMORUN_HOME is not set. Defaulting to current working directory.")
    # NOTE(review): original indentation is ambiguous in this view; the
    # .get(..., os.getcwd()) default suggests this runs unconditionally —
    # confirm against the original file.
    run.config.set_nemorun_home(os.environ.get("NEMORUN_HOME", os.getcwd()))

    # Local Docker runs keep artifacts next to the working directory.
    if hf_local is not None:
        job_dir = os.path.join(os.getcwd(), "local_experiments")

    job_table = {}
    if pipeline is not None:
        job_table[job_name] = pipeline
    else:
        print("No pipeline provided. Use pipeline=@<yaml> or --yaml <yaml>.")
        return

    run_jobs(
        job_table=job_table,
        hf_local=hf_local,
        user=user,
        identity=identity,
        job_dir=job_dir,
        packager=packager,
        default_slurm_env=DEFAULT_SLURM_ENV,
        default_local_env=DEFAULT_LOCAL_ENV,
        experiment_title=EXPERIMENT_TITLE,
        detach=detach,
        modelopt_src_path=MODELOPT_SRC_PATH,
        base_dir=LAUNCHER_DIR,
    )


if __name__ == "__main__":
    run.cli.main(launch)
diff --git a/launcher/modules/Megatron-LM b/launcher/modules/Megatron-LM
new file mode 160000
index 000000000..1e064f361
--- /dev/null
+++ b/launcher/modules/Megatron-LM
@@ -0,0 +1 @@
+Subproject commit 1e064f361256f34bf179c0cb808fd6287538f85a
diff --git a/launcher/modules/Model-Optimizer b/launcher/modules/Model-Optimizer
new file mode 120000
index 000000000..c25bddb6d
--- /dev/null
+++ b/launcher/modules/Model-Optimizer
@@ -0,0 +1 @@
+../..
\ No newline at end of file diff --git a/launcher/pyproject.toml b/launcher/pyproject.toml new file mode 100644 index 000000000..6ecc201e8 --- /dev/null +++ b/launcher/pyproject.toml @@ -0,0 +1,15 @@ +[project] +name = "modelopt-launcher" +version = "0.1.0" +description = "ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters" +requires-python = ">=3.10" +dependencies = [ + "nemo-run>=0.8.0", + "pyyaml", +] + +[tool.setuptools] +py-modules = [] + +[dependency-groups] +dev = [] diff --git a/launcher/pytest.ini b/launcher/pytest.ini new file mode 100644 index 000000000..5ee647716 --- /dev/null +++ b/launcher/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests diff --git a/launcher/slurm_config.py b/launcher/slurm_config.py new file mode 100644 index 000000000..53e39aa42 --- /dev/null +++ b/launcher/slurm_config.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Slurm configuration and factory for the ModelOpt Launcher.""" + +import os +from dataclasses import dataclass + +import nemo_run as run + + +@dataclass +class SlurmConfig: + """Cluster-agnostic Slurm configuration. + + Users define cluster details in their YAML configs or override via CLI. + No internal cluster defaults are embedded here. 
+ """ + + host: str = None + port: int = 22 + account: str = None + partition: str = "batch" + container: str = None + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt" + container_mounts: list[str] = None + srun_args: list[str] = None + array: str = None + nodes: int = 1 + ntasks_per_node: int = 1 + gpus_per_node: int = 1 + local: bool = False + + +@run.cli.factory +@run.autoconvert +def slurm_factory( + host: str = os.environ.get("SLURM_HOST", ""), + account: str = os.environ.get("SLURM_ACCOUNT", ""), + partition: str = "batch", + nodes: int = 1, + ntasks_per_node: int = 1, + gpus_per_node: int = 1, + container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5", + modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt", + container_mounts: list[str] = [ + "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")), + ], + srun_args: list[str] = ["--no-container-mount-home"], + array: str = None, # noqa: RUF013 +) -> SlurmConfig: + """Generic Slurm factory — configure via environment variables or CLI overrides.""" + return SlurmConfig( + host=host, + account=account, + partition=partition, + nodes=nodes, + ntasks_per_node=ntasks_per_node, + gpus_per_node=gpus_per_node, + container=container, + modelopt_install_path=modelopt_install_path, + container_mounts=container_mounts, + srun_args=srun_args, + array=array, + ) diff --git a/launcher/tests/__init__.py b/launcher/tests/__init__.py new file mode 100644 index 000000000..7c9dc907f --- /dev/null +++ b/launcher/tests/__init__.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for the ModelOpt Launcher. + +Coverage: + - test_core.py: Shared dataclasses, factory registry, global_vars interpolation, + version reporting, default env generation, and the run_jobs loop (mocked). + - test_slurm_config.py: SlurmConfig dataclass defaults and slurm_factory behavior + with environment variable overrides. + - test_yaml_formats.py: YAML parsing for --yaml format, pipeline=@ format, and + task_configs resolution via registered factories. + +Not covered (requires live infrastructure): + - Actual Slurm job submission (SSH tunnel, sbatch) + - Docker container launch + - nemo experiment status/logs polling + - PatternPackager tar.gz creation and rsync +""" diff --git a/launcher/tests/conftest.py b/launcher/tests/conftest.py new file mode 100644 index 000000000..bb6ccb045 --- /dev/null +++ b/launcher/tests/conftest.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Fixtures for launcher unit tests. + +Run from the launcher directory: + cd Model-Optimizer/launcher + uv pip install pytest + uv run python3 -m pytest tests/ -v + +Or via tox from Model-Optimizer root: + tox -e py312-launcher +""" + +import os +import sys + +import pytest + + +@pytest.fixture(autouse=True) +def add_launcher_to_path(): + """Add the launcher directory to sys.path so core.py and slurm_config.py can be imported.""" + launcher_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if launcher_dir not in sys.path: + sys.path.insert(0, launcher_dir) + yield + if launcher_dir in sys.path: + sys.path.remove(launcher_dir) + + +@pytest.fixture +def tmp_yaml(tmp_path): + """Helper to write a YAML file and return its path.""" + + def _write(content, name="test.yaml"): + p = tmp_path / name + p.write_text(content) + return str(p) + + return _write diff --git a/launcher/tests/test_core.py b/launcher/tests/test_core.py new file mode 100644 index 000000000..6c7e8f043 --- /dev/null +++ b/launcher/tests/test_core.py @@ -0,0 +1,244 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D102 +"""Tests for launcher/core.py — shared dataclasses, factory registry, and utilities. 
+ +Coverage: + - SandboxTask: dataclass fields and defaults, skip flag + - SandboxPipeline: task slot collection, task_configs resolution, global_vars interpolation + - Factory registry: register_factory, lookup in create_task_from_yaml + - set_slurm_config_type: patches SandboxTask annotation + - get_default_env: returns correct env dicts for a given experiment title + - report_versions: runs without error on a git repo +""" + +import os + + +class TestSandboxTask: + """Tests for the SandboxTask dataclass.""" + + def test_defaults(self): + from core import SandboxTask + + task = SandboxTask() + assert task.script is None + assert task.slurm_config is None + assert task.args is None + assert task.environment is None + assert task.skip is False + + def test_with_values(self): + from core import SandboxTask + + task = SandboxTask( + script="test.sh", + args=["--foo", "bar"], + environment=[{"KEY": "val"}], + skip=True, + ) + assert task.script == "test.sh" + assert task.args == ["--foo", "bar"] + assert task.environment == [{"KEY": "val"}] + assert task.skip is True + + +class TestSandboxPipeline: + """Tests for SandboxPipeline task collection and global_vars interpolation.""" + + def test_task_slots_collected(self): + from core import SandboxPipeline, SandboxTask0, SandboxTask1 + + t0 = SandboxTask0(script="a.sh") + t1 = SandboxTask1(script="b.sh") + pipeline = SandboxPipeline(task_0=t0, task_1=t1) + assert len(pipeline.tasks) == 2 + assert pipeline.tasks[0].script == "a.sh" + assert pipeline.tasks[1].script == "b.sh" + + def test_empty_pipeline(self): + from core import SandboxPipeline + + pipeline = SandboxPipeline() + assert pipeline.tasks == [] + + def test_global_vars_interpolation_in_environment(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + environment=[{"MODEL": "<<global_vars.hf_model>>"}], + ) + pipeline = SandboxPipeline( + task_0=t0, + 
global_vars=GlobalVariables(hf_model="/hf-local/Qwen/Qwen3-8B"), + ) + assert pipeline.tasks[0].environment == [{"MODEL": "/hf-local/Qwen/Qwen3-8B"}] + + def test_global_vars_interpolation_in_args(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["--model", "<<global_vars.hf_model>>"], + ) + pipeline = SandboxPipeline( + task_0=t0, + global_vars=GlobalVariables(hf_model="/models/llama"), + ) + assert pipeline.tasks[0].args == ["--model", "/models/llama"] + + def test_global_vars_unresolved_passthrough(self): + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["<<global_vars.nonexistent>>"], + ) + pipeline = SandboxPipeline( + task_0=t0, + global_vars=GlobalVariables(hf_model="/models/llama"), + ) + # Unresolved references are left as-is + assert pipeline.tasks[0].args == ["<<global_vars.nonexistent>>"] + + def test_skip_and_allow_to_fail(self): + from core import SandboxPipeline + + pipeline = SandboxPipeline(skip=True, allow_to_fail=True, note="test note") + assert pipeline.skip is True + assert pipeline.allow_to_fail is True + assert pipeline.note == "test note" + + +class TestFactoryRegistry: + """Tests for register_factory and its use in create_task_from_yaml.""" + + def test_register_and_lookup(self, tmp_yaml): + from core import _FACTORY_REGISTRY, register_factory + + # Register a mock factory + def mock_factory(nodes=1, **kwargs): + return {"nodes": nodes, "factory": "mock"} + + register_factory("mock_factory", mock_factory) + assert "mock_factory" in _FACTORY_REGISTRY + assert _FACTORY_REGISTRY["mock_factory"] is mock_factory + + def test_create_task_from_yaml_uses_registry(self, tmp_yaml): + from core import create_task_from_yaml, register_factory + + def test_factory(nodes=1): + return {"nodes": nodes} + + register_factory("test_factory", test_factory) + + yaml_content = """ +script: test.sh +args: + - --flag 
+slurm_config: + _factory_: "test_factory" + nodes: 2 +""" + path = tmp_yaml(yaml_content) + task = create_task_from_yaml(path, factory_lookup={"test_factory": test_factory}) + assert task.script == "test.sh" + assert task.args == ["--flag"] + assert task.slurm_config == {"nodes": 2} + + def test_task_configs_resolved_via_registry(self, tmp_yaml): + from core import SandboxPipeline, register_factory + + def dummy_factory(nodes=1): + return {"nodes": nodes} + + register_factory("dummy_factory", dummy_factory) + + task_yaml = tmp_yaml( + """ +script: hello.sh +slurm_config: + _factory_: "dummy_factory" + nodes: 3 +""", + name="task.yaml", + ) + pipeline = SandboxPipeline(task_configs=[task_yaml]) + assert len(pipeline.tasks) == 1 + assert pipeline.tasks[0].script == "hello.sh" + assert pipeline.tasks[0].slurm_config == {"nodes": 3} + + +class TestSetSlurmConfigType: + """Tests for set_slurm_config_type annotation patching.""" + + def test_patches_annotation(self): + from dataclasses import dataclass + + from core import SandboxTask, set_slurm_config_type + + @dataclass + class MockSlurmConfig: + host: str = "test" + + set_slurm_config_type(MockSlurmConfig) + assert SandboxTask.__annotations__["slurm_config"] is MockSlurmConfig + assert SandboxTask.__dataclass_fields__["slurm_config"].type is MockSlurmConfig + + +class TestGetDefaultEnv: + """Tests for get_default_env utility.""" + + def test_default_title(self): + from core import get_default_env + + slurm_env, local_env = get_default_env() + assert slurm_env["TRITON_CACHE_DIR"] == "/cicd/triton-cache" + assert slurm_env["HF_HOME"] == "/cicd/hf-cache" + assert slurm_env["MLM_SKIP_INSTALL"] == "1" + assert "LAUNCH_SCRIPT" in slurm_env + assert local_env["TRITON_CACHE_DIR"] == "/cicd/triton-cache" + assert "LAUNCH_SCRIPT" not in local_env + + def test_custom_title(self): + from core import get_default_env + + slurm_env, local_env = get_default_env("modelopt") + assert slurm_env["TRITON_CACHE_DIR"] == 
"/modelopt/triton-cache" + assert slurm_env["HF_HOME"] == "/modelopt/hf-cache" + assert local_env["HF_HOME"] == "/modelopt/hf-cache" + + +class TestReportVersions: + """Tests for report_versions git info utility.""" + + def test_runs_on_repo(self, capsys): + from core import report_versions + + # Should not raise — runs git on the current repo + report_versions(os.getcwd()) + captured = capsys.readouterr() + assert "Version Report" in captured.out + + def test_runs_on_nonexistent_dir(self, capsys): + from core import report_versions + + # Should handle gracefully — "unknown" for non-git dirs + report_versions("/tmp/nonexistent_dir_12345") + captured = capsys.readouterr() + assert "Version Report" in captured.out + assert "unknown" in captured.out diff --git a/launcher/tests/test_core_extended.py b/launcher/tests/test_core_extended.py new file mode 100644 index 000000000..9d4ba5604 --- /dev/null +++ b/launcher/tests/test_core_extended.py @@ -0,0 +1,353 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D102 +"""Extended tests for launcher/core.py — edge cases and remaining coverage gaps. 
+ +Coverage: + - create_task_from_yaml: error cases (missing factory, bad YAML) + - SandboxPipeline: dict environment (not list), task_configs with registry fallback + - _git_info: direct tests for success and failure + - run_jobs: environment merging (list vs dict), test_level filtering, pipeline skip, + detach flag, version report +""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + + +class TestCreateTaskFromYamlErrors: + """Error handling in create_task_from_yaml.""" + + def test_missing_factory_raises(self, tmp_yaml): + from core import create_task_from_yaml + + yaml_content = """ +script: test.sh +slurm_config: + _factory_: "nonexistent_factory" + nodes: 1 +""" + path = tmp_yaml(yaml_content) + with pytest.raises(KeyError): + create_task_from_yaml(path, factory_lookup={}) + + def test_missing_slurm_config_raises(self, tmp_yaml): + from core import create_task_from_yaml + + yaml_content = """ +script: test.sh +""" + path = tmp_yaml(yaml_content) + with pytest.raises((KeyError, TypeError)): + create_task_from_yaml(path, factory_lookup={}) + + def test_environment_preserved(self, tmp_yaml): + from core import create_task_from_yaml + + def factory(nodes=1): + return {"nodes": nodes} + + yaml_content = """ +script: test.sh +environment: + - KEY1: val1 + - KEY2: val2 +slurm_config: + _factory_: "f" + nodes: 1 +""" + path = tmp_yaml(yaml_content) + task = create_task_from_yaml(path, factory_lookup={"f": factory}) + assert task.environment == [{"KEY1": "val1"}, {"KEY2": "val2"}] + + +class TestSandboxPipelineExtended: + """Extended SandboxPipeline tests.""" + + def test_dict_environment_interpolation(self): + """Global vars resolve in dict-format environment (not list).""" + from core import GlobalVariables, SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + environment={"MODEL": "<<global_vars.hf_model>>", "STATIC": "value"}, + ) + pipeline = SandboxPipeline( + task_0=t0, + 
global_vars=GlobalVariables(hf_model="/hf-local/model"), + ) + assert pipeline.tasks[0].environment == { + "MODEL": "/hf-local/model", + "STATIC": "value", + } + + def test_tasks_list_directly(self): + """Pipeline can receive tasks as a list directly.""" + from core import SandboxPipeline, SandboxTask + + tasks = [ + SandboxTask(script="a.sh"), + SandboxTask(script="b.sh"), + SandboxTask(script="c.sh"), + ] + pipeline = SandboxPipeline(tasks=tasks) + assert len(pipeline.tasks) == 3 + assert pipeline.tasks[2].script == "c.sh" + + def test_no_global_vars_no_error(self): + """Pipeline without global_vars doesn't crash on interpolation.""" + from core import SandboxPipeline, SandboxTask0 + + t0 = SandboxTask0( + script="test.sh", + args=["<<global_vars.hf_model>>"], + ) + pipeline = SandboxPipeline(task_0=t0) + # No interpolation happens — args kept as-is + assert pipeline.tasks[0].args == ["<<global_vars.hf_model>>"] + + +class TestGitInfo: + """Direct tests for _git_info helper.""" + + def test_valid_git_repo(self): + from core import _git_info + + commit, branch = _git_info(os.getcwd()) + assert commit != "unknown" + assert branch != "unknown" + assert len(commit) >= 7 # short hash + + def test_nonexistent_directory(self): + from core import _git_info + + commit, branch = _git_info("/tmp/nonexistent_xyz_12345") + assert commit == "unknown" + assert branch == "unknown" + + def test_non_git_directory(self): + from core import _git_info + + # Use /tmp which is outside any git repo + commit, branch = _git_info("/tmp") + # /tmp may or may not be inside a git worktree depending on the system + # Just verify it returns strings without crashing + assert isinstance(commit, str) + assert isinstance(branch, str) + + +class TestRunJobsExtended: + """Extended run_jobs tests for env merging, test_level, and detach.""" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_environment_list_merged_to_env(self, mock_docker, mock_exp, tmp_path): + 
"""List-of-dicts environment is merged into task_env.""" + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_env" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0( + script="test.sh", + slurm_config=MagicMock(), + environment=[{"A": "1"}, {"B": "2"}], + ) + pipeline = SandboxPipeline(task_0=t0) + + with patch("core.run.Script") as mock_script: + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + # Script called with merged env + call_kwargs = mock_script.call_args[1] + assert "A" in call_kwargs["env"] + assert "B" in call_kwargs["env"] + assert call_kwargs["env"]["A"] == "1" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_none_env_values_converted_to_empty_string(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_none" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0( + script="test.sh", + slurm_config=MagicMock(), + environment=[{"KEY": None}], + ) + pipeline = SandboxPipeline(task_0=t0) + + with patch("core.run.Script") as mock_script: + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + 
default_local_env=local_env, + base_dir=str(tmp_path), + ) + env = mock_script.call_args[1]["env"] + assert env["KEY"] == "" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_test_level_filters_pipeline(self, mock_docker, mock_exp, tmp_path): + """Pipelines with test_level > current are skipped.""" + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_lvl" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, test_level=2) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + test_level=0, # lower than pipeline's test_level=2 + base_dir=str(tmp_path), + ) + + # Experiment should not be created for skipped pipelines + mock_exp.assert_not_called() + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_skipped_pipeline_not_run(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, skip=True) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + + mock_exp.assert_not_called() + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def 
test_detach_flag_passed_to_experiment(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_detach" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + detach=True, + base_dir=str(tmp_path), + ) + + mock_exp_inst.run.assert_called_once_with(detach=True) + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_version_report_called(self, mock_docker, mock_exp, tmp_path, capsys): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_inst = MagicMock() + mock_exp_inst._id = "exp_ver" + mock_exp_inst.__enter__ = MagicMock(return_value=mock_exp_inst) + mock_exp_inst.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_inst + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env() + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + + run_jobs( + job_table={"job": pipeline}, + hf_local="/tmp/hf", + user="u", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + base_dir=str(tmp_path), + ) + + captured = capsys.readouterr() + assert "Version Report" in captured.out diff --git a/launcher/tests/test_docker_execution.py b/launcher/tests/test_docker_execution.py new file mode 100644 index 000000000..6d3fa0fa7 --- 
/dev/null +++ b/launcher/tests/test_docker_execution.py @@ -0,0 +1,332 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D102 +"""Tests for Docker execution path — verifies build_docker_executor and run_jobs with mocked Docker. + +Coverage: + - build_docker_executor: container mounts, scratch dir creation, modelopt mount + - run_jobs with hf_local: Docker path selected, env vars applied, metadata written + - --yaml format end-to-end: YAML parsed, pipeline constructed, executor built +""" + +import json +import os +from unittest.mock import MagicMock, patch + + +class TestBuildDockerExecutor: + """Tests for build_docker_executor mount and directory setup.""" + + def test_scratch_dir_created(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + build_docker_executor( + hf_local="/tmp/hf-local", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + scratch_dir = os.path.join(job_dir, "cicd", "exp_123", "task_0") + assert os.path.isdir(scratch_dir) + + def test_hf_local_mount(self, tmp_path): + 
from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/my/hf-local", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/my/hf-local:/hf-local" in v for v in volumes) + + def test_scratchspace_mount(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_456", + job_dir=job_dir, + task_name="job_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + expected_scratch = os.path.join(job_dir, "cicd", "exp_456", "job_0") + assert any(f"{expected_scratch}:/scratchspace" in v for v in volumes) + + def test_modelopt_mount(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_789", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/custom/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/custom/modelopt:/opt/modelopt" in v for v in volumes) + + def test_experiment_title_mount(self, tmp_path): + from core import build_docker_executor + + job_dir = 
str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=False, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=None, + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="modelopt", + ) + volumes = executor.volumes + exp_title_path = os.path.join(job_dir, "modelopt") + assert any(f"{exp_title_path}:/modelopt" in v for v in volumes) + + def test_local_slurm_config_mounts_preserved(self, tmp_path): + from core import build_docker_executor + + job_dir = str(tmp_path / "experiments") + executor = build_docker_executor( + hf_local="/tmp/hf", + slurm_config=MagicMock( + local=True, + container="test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=["/data:/data", "/models:/models"], + srun_args=None, + array=None, + ), + experiment_id="exp_123", + job_dir=job_dir, + task_name="task_0", + packager=MagicMock(), + modelopt_src_path="/tmp/modelopt", + experiment_title="cicd", + ) + volumes = executor.volumes + assert any("/data:/data" in v for v in volumes) + assert any("/models:/models" in v for v in volumes) + + +class TestRunJobsDockerPath: + """Tests for run_jobs selecting Docker path when hf_local is set.""" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_docker_executor_called_with_hf_local(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_001" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0( + 
script="echo hello", + slurm_config=MagicMock(), + ) + pipeline = SandboxPipeline(task_0=t0) + job_table = {"test_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf-local", + user="testuser", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + mock_docker.assert_called_once() + call_kwargs = mock_docker.call_args + assert call_kwargs[0][0] == "/tmp/hf-local" # hf_local + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def test_metadata_written(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_meta" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="test.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, allow_to_fail=True, note="test note") + job_table = {"meta_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf", + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + metadata_path = os.path.join("experiments", "cicd", "test_exp_meta", "metadata.json") + assert os.path.exists(metadata_path) + with open(metadata_path) as f: + meta = json.load(f) + assert meta["experiment_id"] == "test_exp_meta" + assert meta["job_name"] == "meta_job" + assert meta["allow_to_fail"] is True + assert meta["note"] == "test note" + + @patch("core.run.Experiment") + @patch("core.build_docker_executor") + def 
test_skipped_task_not_submitted(self, mock_docker, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, SandboxTask1, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_skip" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_docker.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="run.sh", slurm_config=MagicMock(), skip=True) + t1 = SandboxTask1(script="eval.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0, task_1=t1) + job_table = {"skip_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local="/tmp/hf", + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + # Only task_1 should be submitted (task_0 is skipped) + assert mock_docker.call_count == 1 + + @patch("core.run.Experiment") + @patch("core.build_slurm_executor") + def test_slurm_executor_called_without_hf_local(self, mock_slurm, mock_exp, tmp_path): + from core import SandboxPipeline, SandboxTask0, get_default_env, run_jobs + + mock_exp_instance = MagicMock() + mock_exp_instance._id = "test_exp_slurm" + mock_exp_instance.__enter__ = MagicMock(return_value=mock_exp_instance) + mock_exp_instance.__exit__ = MagicMock(return_value=False) + mock_exp.return_value = mock_exp_instance + + mock_slurm.return_value = MagicMock() + + slurm_env, local_env = get_default_env("cicd") + + t0 = SandboxTask0(script="train.sh", slurm_config=MagicMock()) + pipeline = SandboxPipeline(task_0=t0) + job_table = {"slurm_job": pipeline} + + run_jobs( + job_table=job_table, + hf_local=None, # No hf_local → Slurm path + user="user", + identity=None, + job_dir=str(tmp_path), + packager=MagicMock(), + 
default_slurm_env=slurm_env, + default_local_env=local_env, + experiment_title="cicd", + base_dir=str(tmp_path), + ) + + mock_slurm.assert_called_once() diff --git a/launcher/tests/test_docker_launch.py b/launcher/tests/test_docker_launch.py new file mode 100644 index 000000000..625d28b08 --- /dev/null +++ b/launcher/tests/test_docker_launch.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration test for Docker container launch via run_jobs. + +Requires Docker to be installed and running. Uses python:3.12-slim +(lightweight, no GPU needed) to run a trivial script. 
+ +Run with: pytest -s (stdin capture must be disabled for invoke/fabric) +""" + +import os +import shutil +import subprocess + +import pytest + +docker_available = shutil.which("docker") is not None + + +@pytest.mark.skipif(not docker_available, reason="Docker not available") +class TestDockerLaunch: + """End-to-end Docker launch test using subprocess to avoid pytest stdin capture issues.""" + + def test_echo_script_via_launch(self, tmp_path): + """Launch a Docker container via launch.py subprocess that runs 'echo hello'.""" + # Create a trivial script + script_dir = tmp_path / "scripts" + script_dir.mkdir() + script = script_dir / "hello.sh" + script.write_text("#!/bin/bash\necho 'HELLO_FROM_DOCKER'\n") + script.chmod(0o755) + + # Create a YAML config + yaml_content = """ +job_name: test_hello +pipeline: + task_0: + script: scripts/hello.sh + slurm_config: + _factory_: "slurm_factory" + container: python:3.12-slim +""" + yaml_path = tmp_path / "test.yaml" + yaml_path.write_text(yaml_content) + + # Run launch.py as a subprocess (avoids pytest stdin capture issues) + launcher_dir = os.path.join(os.path.dirname(__file__), "..") + launcher_dir = os.path.abspath(launcher_dir) + + result = subprocess.run( + [ + "uv", + "run", + "launch.py", + "--yaml", + str(yaml_path), + f"hf_local={tmp_path}", + "--yes", + ], + cwd=launcher_dir, + capture_output=True, + text=True, + timeout=300, + ) + + # Check output + assert "Version Report" in result.stdout + assert "Launching" in result.stdout or "Entering Experiment" in result.stdout + + def test_failing_script_via_launch(self, tmp_path): + """Launch a Docker container that exits 1 — launch.py should not crash.""" + script_dir = tmp_path / "scripts" + script_dir.mkdir() + script = script_dir / "fail.sh" + script.write_text("#!/bin/bash\necho 'FAILING'\nexit 1\n") + script.chmod(0o755) + + yaml_content = """ +job_name: test_fail +pipeline: + task_0: + script: scripts/fail.sh + slurm_config: + _factory_: "slurm_factory" + 
container: python:3.12-slim +""" + yaml_path = tmp_path / "fail_test.yaml" + yaml_path.write_text(yaml_content) + + launcher_dir = os.path.join(os.path.dirname(__file__), "..") + launcher_dir = os.path.abspath(launcher_dir) + + result = subprocess.run( + [ + "uv", + "run", + "launch.py", + "--yaml", + str(yaml_path), + f"hf_local={tmp_path}", + "--yes", + ], + cwd=launcher_dir, + capture_output=True, + text=True, + timeout=300, + ) + + # launch.py should complete (exit 0) even if the job fails + # The job failure is reported in stdout + assert "Version Report" in result.stdout diff --git a/launcher/tests/test_slurm_config.py b/launcher/tests/test_slurm_config.py new file mode 100644 index 000000000..b23c46c24 --- /dev/null +++ b/launcher/tests/test_slurm_config.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D102 +"""Tests for launcher/slurm_config.py — SlurmConfig dataclass and factory. 
+ +Coverage: + - SlurmConfig: default values, field types + - slurm_factory: default behavior, env var overrides (SLURM_HOST, SLURM_ACCOUNT, + SLURM_HF_LOCAL), return type +""" + + +class TestSlurmConfig: + """Tests for the SlurmConfig dataclass.""" + + def test_defaults(self): + from slurm_config import SlurmConfig + + cfg = SlurmConfig() + assert cfg.host is None + assert cfg.port == 22 + assert cfg.account is None + assert cfg.partition == "batch" + assert cfg.container is None + assert cfg.nodes == 1 + assert cfg.ntasks_per_node == 1 + assert cfg.gpus_per_node == 1 + assert cfg.local is False + assert cfg.container_mounts is None + assert cfg.srun_args is None + assert cfg.array is None + + def test_custom_values(self): + from slurm_config import SlurmConfig + + cfg = SlurmConfig( + host="login.example.com", + account="my_account", + nodes=4, + gpus_per_node=8, + container="nvcr.io/nvidia/pytorch:24.01-py3", + container_mounts=["/data:/data"], + srun_args=["--no-container-mount-home"], + ) + assert cfg.host == "login.example.com" + assert cfg.account == "my_account" + assert cfg.nodes == 4 + assert cfg.gpus_per_node == 8 + assert cfg.container_mounts == ["/data:/data"] + + +class TestSlurmFactory: + """Tests for the slurm_factory function.""" + + def test_default_returns_slurm_config(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + # slurm_factory with @run.autoconvert returns a nemo-run Config wrapper + assert "SlurmConfig" in repr(cfg) + + def test_default_container(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + assert "tensorrt-llm" in cfg.container + + def test_default_srun_args(self): + from slurm_config import slurm_factory + + cfg = slurm_factory() + assert cfg.srun_args == ["--no-container-mount-home"] + + def test_default_container_mounts_from_env(self, monkeypatch): + monkeypatch.setenv("SLURM_HF_LOCAL", "/custom/hf-local") + # Need to re-import to pick up the env var in the default + # The 
factory reads SLURM_HF_LOCAL at call time via the default arg + import importlib + + import slurm_config + + importlib.reload(slurm_config) + cfg = slurm_config.slurm_factory() + assert any("/custom/hf-local:/hf-local" in m for m in cfg.container_mounts) + + def test_override_nodes(self): + from slurm_config import slurm_factory + + cfg = slurm_factory(nodes=8) + assert cfg.nodes == 8 + + def test_override_partition(self): + from slurm_config import slurm_factory + + cfg = slurm_factory(partition="gpu") + assert cfg.partition == "gpu" + + def test_env_var_host(self, monkeypatch): + monkeypatch.setenv("SLURM_HOST", "test-host.example.com") + import importlib + + import slurm_config + + importlib.reload(slurm_config) + cfg = slurm_config.slurm_factory() + assert cfg.host == "test-host.example.com" diff --git a/launcher/tests/test_slurm_executor.py b/launcher/tests/test_slurm_executor.py new file mode 100644 index 000000000..d7ac7827f --- /dev/null +++ b/launcher/tests/test_slurm_executor.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa: D102 +"""Tests for build_slurm_executor — container mounts, scratch paths, executor params. + +Note: actual SSH tunnel and sbatch submission are not tested (require live infra). +We mock run.SSHTunnel and run.SlurmExecutor to verify the arguments passed. 
+""" + +from unittest.mock import MagicMock, patch + + +class TestBuildSlurmExecutor: + """Tests for build_slurm_executor mount construction and executor params.""" + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_scratch_and_modelopt_mounts(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="test-host", + port=22, + account="test_account", + partition="batch", + container="nvcr.io/test:latest", + modelopt_install_path="/opt/modelopt", + container_mounts=["/hf-local:/hf-local"], + srun_args=["--no-container-mount-home"], + nodes=1, + ntasks_per_node=4, + gpus_per_node=4, + array=None, + ) + + build_slurm_executor( + user="testuser", + identity=None, + slurm_config=slurm_config, + experiment_id="exp_001", + job_dir="/lustre/experiments", + task_name="job_0", + packager=MagicMock(), + experiment_title="cicd", + ) + + # Check SlurmExecutor was called + mock_executor.assert_called_once() + call_kwargs = mock_executor.call_args[1] + + # Verify container mounts include scratch, modelopt, and experiment title + mounts = call_kwargs["container_mounts"] + assert any("/scratchspace" in m for m in mounts) + assert any("/opt/modelopt" in m for m in mounts) + assert any("/cicd" in m for m in mounts) + # Original mount preserved + assert any("/hf-local:/hf-local" in m for m in mounts) + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_scratch_path_uses_experiment_title(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="host", + port=22, + account="acct", + partition="batch", + container="img", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=[], + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, 
+ experiment_id="exp_xyz", + job_dir="/data", + task_name="task_0", + packager=MagicMock(), + experiment_title="modelopt", + ) + + mounts = mock_executor.call_args[1]["container_mounts"] + assert any("/data/modelopt/exp_xyz:/scratchspace" in m for m in mounts) + assert any("/data/modelopt:/modelopt" in m for m in mounts) + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_tunnel_created_with_correct_params(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="login.cluster.com", + port=30022, + account="acct", + partition="batch", + container="img", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=[], + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="myuser", + identity="/home/.ssh/id_rsa", + slurm_config=slurm_config, + experiment_id="exp_1", + job_dir="/job", + task_name="t0", + packager=MagicMock(), + ) + + mock_tunnel.assert_called_once() + tunnel_kwargs = mock_tunnel.call_args[1] + assert tunnel_kwargs["host"] == "login.cluster.com" + assert tunnel_kwargs["user"] == "myuser" + assert tunnel_kwargs["port"] == 30022 + assert tunnel_kwargs["identity"] == "/home/.ssh/id_rsa" + assert tunnel_kwargs["job_dir"] == "/job" + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_executor_params(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="h", + port=22, + account="my_acct", + partition="gpu", + container="nvcr.io/img:v1", + modelopt_install_path="/opt/mo", + container_mounts=[], + srun_args=["--mpi=pmix"], + nodes=2, + ntasks_per_node=8, + gpus_per_node=8, + array="0-3", + ) + + packager = MagicMock() + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, + experiment_id="e1", + job_dir="/j", + task_name="t0", + 
packager=packager, + ) + + kw = mock_executor.call_args[1] + assert kw["account"] == "my_acct" + assert kw["partition"] == "gpu" + assert kw["nodes"] == 2 + assert kw["ntasks_per_node"] == 8 + assert kw["gpus_per_node"] == 8 + assert kw["container_image"] == "nvcr.io/img:v1" + assert kw["srun_args"] == ["--mpi=pmix"] + assert kw["array"] == "0-3" + assert kw["packager"] is packager + assert kw["time"] == "04:00:00" + assert kw["retries"] == 0 + + @patch("core.run.SlurmExecutor") + @patch("core.run.SSHTunnel") + def test_none_container_mounts_handled(self, mock_tunnel, mock_executor): + from core import build_slurm_executor + + mock_tunnel.return_value = MagicMock() + + slurm_config = MagicMock( + host="h", + port=22, + account="a", + partition="b", + container="c", + modelopt_install_path="/m", + container_mounts=None, + srun_args=None, + nodes=1, + ntasks_per_node=1, + gpus_per_node=1, + array=None, + ) + + build_slurm_executor( + user="u", + identity=None, + slurm_config=slurm_config, + experiment_id="e", + job_dir="/j", + task_name="t", + packager=MagicMock(), + ) + + # Should not crash; mounts should still include scratch + modelopt + title + mounts = mock_executor.call_args[1]["container_mounts"] + assert len(mounts) >= 3 diff --git a/launcher/tests/test_yaml_formats.py b/launcher/tests/test_yaml_formats.py new file mode 100644 index 000000000..981c32216 --- /dev/null +++ b/launcher/tests/test_yaml_formats.py @@ -0,0 +1,192 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for YAML config parsing — verifies that different YAML formats produce correct dataclasses. + +Coverage: + - --yaml format: top-level job_name + pipeline with task_0, environment, slurm_config + - pipeline=@ format: bare SandboxPipeline without job_name wrapper + - task_configs: list of YAML paths resolved via factory registry + - Environment formats: list-of-dicts and flat dict both parsed correctly + - Global vars: <<global_vars.X>> resolved in both args and environment +""" + +import yaml + + +class TestYamlFormatParsing: + """Tests that YAML content parses into correct dataclass structures.""" + + def test_yaml_format_with_job_name(self, tmp_yaml): + """The --yaml format has job_name and pipeline as top-level keys.""" + content = """ +job_name: test_job +pipeline: + skip: false + allow_to_fail: true + note: "test note" + task_0: + script: test.sh + args: + - --flag + environment: + - KEY: value +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + assert data["job_name"] == "test_job" + assert data["pipeline"]["skip"] is False + assert data["pipeline"]["allow_to_fail"] is True + assert data["pipeline"]["note"] == "test note" + assert data["pipeline"]["task_0"]["script"] == "test.sh" + assert data["pipeline"]["task_0"]["args"] == ["--flag"] + assert data["pipeline"]["task_0"]["environment"] == [{"KEY": "value"}] + + def test_bare_pipeline_format(self, tmp_yaml): + """The pipeline=@ format is a bare SandboxPipeline without wrapper.""" + content = """ +task_0: + script: a.sh + args: + - --foo +task_1: + script: 
b.sh +allow_to_fail: false +skip: false +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + # Verify the YAML parses into valid SandboxPipeline kwargs + # (nemo-run does this via its CLI parser; we just verify the structure) + assert "task_0" in data + assert "task_1" in data + assert data["task_0"]["script"] == "a.sh" + assert data["task_1"]["script"] == "b.sh" + + def test_task_configs_format(self, tmp_yaml): + """task_configs lists YAML files that are resolved into tasks.""" + from core import SandboxPipeline, register_factory + + def local_factory(nodes=1): + return {"nodes": nodes} + + register_factory("local_factory", local_factory) + + task_path = tmp_yaml( + """ +script: worker.sh +args: + - --batch-size 32 +slurm_config: + _factory_: "local_factory" + nodes: 2 +""", + name="worker.yaml", + ) + + pipeline = SandboxPipeline(task_configs=[task_path]) + assert len(pipeline.tasks) == 1 + assert pipeline.tasks[0].script == "worker.sh" + assert pipeline.tasks[0].args == ["--batch-size 32"] + assert pipeline.tasks[0].slurm_config == {"nodes": 2} + + def test_environment_list_of_dicts(self): + """Environment as list-of-single-key-dicts (nemo-run format).""" + from core import SandboxTask + + task = SandboxTask( + script="test.sh", + environment=[{"A": "1"}, {"B": "2"}, {"C": "3"}], + ) + assert len(task.environment) == 3 + assert task.environment[0] == {"A": "1"} + + def test_global_vars_across_multiple_tasks(self, tmp_yaml): + """Global vars resolve in both task_0 and task_1.""" + from core import GlobalVariables, SandboxPipeline, SandboxTask0, SandboxTask1 + + t0 = SandboxTask0( + script="quantize.sh", + args=["--model", "<<global_vars.hf_model>>"], + environment=[{"HF_MODEL": "<<global_vars.hf_model>>"}], + ) + t1 = SandboxTask1( + script="eval.sh", + environment=[{"HF_MODEL": "<<global_vars.hf_model>>"}], + ) + pipeline = SandboxPipeline( + task_0=t0, + task_1=t1, + 
global_vars=GlobalVariables(hf_model="/hf-local/Qwen/Qwen3-8B"), + ) + assert pipeline.tasks[0].args == ["--model", "/hf-local/Qwen/Qwen3-8B"] + assert pipeline.tasks[0].environment == [{"HF_MODEL": "/hf-local/Qwen/Qwen3-8B"}] + assert pipeline.tasks[1].environment == [{"HF_MODEL": "/hf-local/Qwen/Qwen3-8B"}] + + +class TestTestYamlFormat: + """Tests for the test YAML format used by run_test_yaml.sh.""" + + def test_target_with_overrides(self, tmp_yaml): + """Test YAML entries have _target_ and override fields.""" + content = """ +- _target_: path/to/config.yaml + pipeline: + allow_to_fail: true + skip: false + note: "known issue" +- _target_: path/to/other.yaml + pipeline: + allow_to_fail: false +""" + path = tmp_yaml(content) + with open(path) as f: + data = yaml.safe_load(f) + + assert isinstance(data, list) + assert len(data) == 2 + assert data[0]["_target_"] == "path/to/config.yaml" + assert data[0]["pipeline"]["allow_to_fail"] is True + assert data[0]["pipeline"]["note"] == "known issue" + assert data[1]["_target_"] == "path/to/other.yaml" + assert data[1]["pipeline"]["allow_to_fail"] is False + + def test_flatten_overrides(self): + """Nested overrides flatten to dot-notation for CLI args.""" + entry = { + "pipeline": { + "allow_to_fail": True, + "skip": False, + } + } + + # Simulate the flatten logic from run_test_yaml.sh + overrides = [] + + def flatten(d, prefix=""): + for k, v in d.items(): + key = f"{prefix}{k}" if prefix else k + if isinstance(v, dict): + flatten(v, f"{key}.") + else: + overrides.append(f"{key}={v}") + + flatten(entry) + assert "pipeline.allow_to_fail=True" in overrides + assert "pipeline.skip=False" in overrides diff --git a/uv.lock b/uv.lock index 5849559ad..0f36f2dbb 100644 --- a/uv.lock +++ b/uv.lock @@ -16,9 +16,6 @@ resolution-markers = [ "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", ] -[manifest] -overrides = [{ name = "torch", marker = "sys_platform == 
'never'" }] - [[package]] name = "accelerate" version = "1.13.0" @@ -31,7 +28,7 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } wheels = [ @@ -407,6 +404,19 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/54/27/01d9078a77b9e31b79b9716e66ca4db74f4744c5232bcb3e8769395c4280/cppimport-22.8.2.tar.gz", hash = "sha256:bbb4957102db41bc99ad72c233bce92f9d1fd91be352fc07878c4361033a401f", size = 26635, upload-time = "2022-08-02T16:50:36.872Z" } +[[package]] +name = "cuda-bindings" +version = "12.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cuda-pathfinder", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/d8/b546104b8da3f562c1ff8ab36d130c8fe1dd6a045ced80b4f6ad74f7d4e1/cuda_bindings-12.9.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d3c842c2a4303b2a580fe955018e31aea30278be19795ae05226235268032e5", size = 12148218, upload-time = "2025-10-21T14:51:28.855Z" }, + { url = "https://files.pythonhosted.org/packages/45/e7/b47792cc2d01c7e1d37c32402182524774dadd2d26339bd224e0e913832e/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c912a3d9e6b6651853eed8eed96d6800d69c08e94052c292fec3f282c5a817c9", size = 12210593, upload-time = "2025-10-21T14:51:36.574Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, +] + [[package]] name = "cuda-pathfinder" version = "1.4.1" @@ -478,7 +488,7 @@ dependencies = [ { name = "psutil", marker = "sys_platform != 'win32'" }, { name = "py-cpuinfo", marker = "sys_platform != 'win32'" }, { name = "pydantic", marker = "sys_platform != 'win32'" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch", marker = "sys_platform != 'win32'" }, { name = "tqdm", marker = "sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/00/ad310cf94e0e397c416087e6c4dc782429292206b2b1a3ffbd388ac95a67/deepspeed-0.18.7.tar.gz", hash = "sha256:3763530196f8e7df8fc56d028a8c64409200695213920dc6cf0045d50c884079", size = 1646894, upload-time = "2026-03-05T20:44:56.579Z" } @@ -1106,7 +1116,9 @@ name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'win32'", "(python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version < '3.11' and sys_platform == 'darwin')", + "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'win32'", "python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } @@ -1119,10 +1131,14 @@ name = 
"networkx" version = "3.6.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'win32'", "(python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version >= '3.12' and sys_platform == 'darwin')", "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'win32') or (python_full_version == '3.11.*' and sys_platform == 'darwin')", "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", "python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'", + "python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'win32'", + "python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } wheels = [ @@ -1274,6 +1290,108 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/57/a7/b35835e278c18b85206834b3aa3abe68e77a98769c59233d1f6300284781/numpy-2.4.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:4b42639cdde6d24e732ff823a3fa5b701d8acad89c4142bc1d0bd6dc85200ba5", size = 12504685, upload-time = "2026-03-09T07:58:50.525Z" }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.10.2.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and 
sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, +] + +[[package]] +name 
= "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine != 'aarch64' and sys_platform != 'darwin' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, +] + [[package]] name = 
"nvidia-ml-py" version = "13.590.48" @@ -1300,7 +1418,7 @@ dependencies = [ { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "setuptools" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "tqdm" }, ] @@ -1505,6 +1623,38 @@ requires-dist = [ ] provides-extras = ["onnx", "hf", "dev-lint", "dev-docs", "dev-test", "all", "dev"] +[[package]] +name = "nvidia-nccl-cu12" +version = "2.27.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = 
"2025-09-06T00:32:31.266Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, +] + [[package]] name = "onnx" version = "1.19.1" @@ -1829,7 +1979,7 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -2882,7 +3032,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, { name = "torchvision" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d7/2c/593109822fe735e637382aca6640c1102c19797f7791f1fd1dab2d6c3cb1/timm-1.0.25.tar.gz", hash = "sha256:47f59fc2754725735cc81bb83bcbfce5bec4ebd5d4bb9e69da57daa92fcfa768", size = 2414743, upload-time = "2026-02-23T16:49:00.137Z" } @@ -2961,15 +3111,52 @@ name = "torch" version = "2.10.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "cuda-bindings", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", 
marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/30/bfebdd8ec77db9a79775121789992d6b3b75ee5494971294d7b4b7c999bc/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2b980edd8d7c0a68c4e951ee1856334a43193f98730d97408fbd148c1a933313", size = 79411457, upload-time = "2026-02-10T21:44:59.189Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" }, + { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/16/ee/efbd56687be60ef9af0c9c0ebe106964c07400eade5b0af8902a1d8cd58c/torch-2.10.0-3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a1ff626b884f8c4e897c4c33782bdacdff842a165fee79817b1dd549fdda1321", size = 915510070, upload-time = "2026-03-11T14:16:39.386Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/7b562f1808d3f65414cd80a4f7d4bb00979d9355616c034c171249e1a303/torch-2.10.0-3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ac5bdcbb074384c66fa160c15b1ead77839e3fe7ed117d667249afce0acabfac", size = 915518691, upload-time = "2026-03-11T14:15:43.147Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" }, + { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" }, + { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" }, + { url = "https://files.pythonhosted.org/packages/76/bb/d820f90e69cda6c8169b32a0c6a3ab7b17bf7990b8f2c680077c24a3c14c/torch-2.10.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:35e407430795c8d3edb07a1d711c41cc1f9eaddc8b2f1cc0a165a6767a8fb73d", size = 79411450, upload-time = "2026-01-21T16:25:30.692Z" }, + { url = "https://files.pythonhosted.org/packages/78/89/f5554b13ebd71e05c0b002f95148033e730d3f7067f67423026cc9c69410/torch-2.10.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3282d9febd1e4e476630a099692b44fdc214ee9bf8ee5377732d9d9dfe5712e4", size = 145992610, upload-time = "2026-01-21T16:25:26.327Z" }, + { url = "https://files.pythonhosted.org/packages/ae/30/a3a2120621bf9c17779b169fc17e3dc29b230c29d0f8222f499f5e159aa8/torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a2f9edd8dbc99f62bc4dfb78af7bf89499bca3d753423ac1b4e06592e467b763", size = 915607863, upload-time = "2026-01-21T16:25:06.696Z" }, + { url = "https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:29b7009dba4b7a1c960260fc8ac85022c784250af43af9fb0ebafc9883782ebd", size = 113723116, upload-time = "2026-01-21T16:25:21.916Z" }, + { url = 
"https://files.pythonhosted.org/packages/61/d8/15b9d9d3a6b0c01b883787bd056acbe5cc321090d4b216d3ea89a8fcfdf3/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:b7bd80f3477b830dd166c707c5b0b82a898e7b16f59a7d9d42778dd058272e8b", size = 79423461, upload-time = "2026-01-21T16:24:50.266Z" }, + { url = "https://files.pythonhosted.org/packages/cc/af/758e242e9102e9988969b5e621d41f36b8f258bb4a099109b7a4b4b50ea4/torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5fd4117d89ffd47e3dcc71e71a22efac24828ad781c7e46aaaf56bf7f2796acf", size = 145996088, upload-time = "2026-01-21T16:24:44.171Z" }, + { url = "https://files.pythonhosted.org/packages/23/8e/3c74db5e53bff7ed9e34c8123e6a8bfef718b2450c35eefab85bb4a7e270/torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:787124e7db3b379d4f1ed54dd12ae7c741c16a4d29b49c0226a89bea50923ffb", size = 915711952, upload-time = "2026-01-21T16:23:53.503Z" }, + { url = "https://files.pythonhosted.org/packages/6e/01/624c4324ca01f66ae4c7cd1b74eb16fb52596dce66dbe51eff95ef9e7a4c/torch-2.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c66c61f44c5f903046cc696d088e21062644cbe541c7f1c4eaae88b2ad23547", size = 113757972, upload-time = "2026-01-21T16:24:39.516Z" }, + { url = "https://files.pythonhosted.org/packages/c9/5c/dee910b87c4d5c0fcb41b50839ae04df87c1cfc663cf1b5fca7ea565eeaa/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6d3707a61863d1c4d6ebba7be4ca320f42b869ee657e9b2c21c736bf17000294", size = 79498198, upload-time = "2026-01-21T16:24:34.704Z" }, +] [[package]] name = "torch-geometric" @@ -2999,7 +3186,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { 
name = "torch" }, { name = "torchvision" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/36/574c0c46e818533b78b3c09505211162918188325ab4165ef11a3f295755/torchprofile-0.0.4.tar.gz", hash = "sha256:96b6da17d752a06b02977e078aea95614893b31d4117dd5dcd081f30ce65611b", size = 4557, upload-time = "2021-06-22T04:58:03.592Z" } @@ -3015,7 +3202,7 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pillow" }, - { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torch" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/50/ae/cbf727421eb73f1cf907fbe5788326a08f111b3f6b6ddca15426b53fec9a/torchvision-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a95c47abb817d4e90ea1a8e57bd0d728e3e6b533b3495ae77d84d883c4d11f56", size = 1874919, upload-time = "2026-01-21T16:27:47.617Z" }, @@ -3100,6 +3287,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, ] +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" }, + { url = 
"https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0"