From 5e696514411572ed6cffd659c93d4d799a74da02 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Tue, 2 Jun 2026 22:44:21 +0000 Subject: [PATCH 01/14] multinode streaming; k2.5 example Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/specdec_bench/specdec_bench/utils.py | 4 + examples/speculative_decoding/launch_train.sh | 8 +- modelopt/recipe/config.py | 6 +- modelopt/torch/speculative/config.py | 6 +- .../common/eagle3/train_eagle_streaming.sh | 124 +++++++++++----- tools/launcher/core.py | 3 + .../Kimi-K2.5/hf_dflash_dryrun.yaml | 64 +++++++++ .../Kimi-K2.5/hf_streaming_dflash.yaml | 131 +++++++++++++++++ .../hf_streaming_dflash_multi_node.yaml | 133 ++++++++++++++++++ .../moonshotai/Kimi-K2.5/specdec_bench.yaml | 81 +++++++++++ tools/launcher/slurm_config.py | 7 + 11 files changed, 530 insertions(+), 37 deletions(-) create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py index 9a52d0ceac2..73d1e048c80 100644 --- a/examples/specdec_bench/specdec_bench/utils.py +++ b/examples/specdec_bench/specdec_bench/utils.py @@ -196,6 +196,10 @@ def _checkpoint_provenance(model_dir): def _is_sensitive_key(key): + # Engine configs can carry non-string dict keys (e.g. int layer ids in a + # serving_config); those are never sensitive field *names*, so skip them. 
+ if not isinstance(key, str): + return False klow = key.lower() if klow in _SENSITIVE_KEY_ALLOWLIST: return False diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh index 41d71d14173..fc623930767 100755 --- a/examples/speculative_decoding/launch_train.sh +++ b/examples/speculative_decoding/launch_train.sh @@ -30,12 +30,14 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" CONFIG_FILE="" NUM_NODES=1 HEAD_NODE_IP="" +MACHINE_RANK="" EXTRA_ARGS=() while [ $# -gt 0 ]; do case "$1" in --config*) if [[ "$1" != *=* ]]; then shift; fi; CONFIG_FILE="${1#*=}" ;; --num_nodes*) if [[ "$1" != *=* ]]; then shift; fi; NUM_NODES="${1#*=}" ;; --head_node_ip*) if [[ "$1" != *=* ]]; then shift; fi; HEAD_NODE_IP="${1#*=}" ;; + --machine_rank*) if [[ "$1" != *=* ]]; then shift; fi; MACHINE_RANK="${1#*=}" ;; *) EXTRA_ARGS+=("$1") ;; esac shift @@ -59,9 +61,13 @@ fi # Multi-node routing args (accelerate only; training config comes from the YAML) MULTI_NODE_ARGS="" if [[ "$NUM_NODES" != "1" ]]; then + # machine_rank: caller may pass --machine_rank explicitly (needed when the + # SLURM allocation reserves node 0 for something else, e.g. the streaming + # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank). + # Default to $SLURM_PROCID for the all-nodes-are-trainers case. 
MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \ --num_machines $NUM_NODES \ - --machine_rank $SLURM_PROCID \ + --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \ --rdzv_backend c10d \ --main_process_ip $HEAD_NODE_IP \ --main_process_port 29500" diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py index 4bf91b52d6f..97d93bbafc6 100644 --- a/modelopt/recipe/config.py +++ b/modelopt/recipe/config.py @@ -178,7 +178,11 @@ class ModelOptDFlashRecipe(ModelOptSpeculativeRecipeBase): @model_validator(mode="after") def _derive_dflash_offline(self) -> ModelOptDFlashRecipe: - self.dflash.dflash_offline = self.data.offline_data_path is not None + # offline (dumped .pt) and streaming (hidden states over HTTP from a vLLM + # serve) both feed pre-computed base hidden states to the DFlash module, so + # both set dflash_offline. Only fully-online training runs the base model. + # Mirrors ModelOptEagleRecipe._derive_eagle_offline. + self.dflash.dflash_offline = self.data.mode != "online" return self diff --git a/modelopt/torch/speculative/config.py b/modelopt/torch/speculative/config.py index 6b2c9396ce7..23ad200b6e7 100644 --- a/modelopt/torch/speculative/config.py +++ b/modelopt/torch/speculative/config.py @@ -68,8 +68,10 @@ class DFlashConfig(ModeloptBaseConfig): dflash_offline: bool = ModeloptField( default=False, description=( - "Whether to use detached DFlash (offline training from pre-computed hidden states). " - "Derived by ModelOptDFlashRecipe from data.offline_data_path; not user-configurable." + "Whether the DFlash module consumes pre-computed hidden states (offline from " + "dumped .pt files, or streaming over HTTP from a vLLM serve) instead of running " + "the base model. Derived by ModelOptDFlashRecipe from data.mode (True unless " + "online); not user-configurable." 
), ) diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index 158bd7a0cf6..4a8dc8bbacf 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -24,12 +24,19 @@ # $SLURM_NODEID: # nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of # the local GPUs (original single-node behavior). -# nodes >= 2 -> split across nodes: node 0 runs vllm serve on all its GPUs, -# node 1 runs the trainer on all its GPUs. The two roles -# rendezvous through the shared /scratchspace mount (node 0 -# publishes its address; node 1 signals completion). For large -# models whose serve needs a whole node (e.g. Kimi-K2.5 TP=8), -# allocate exactly 2 nodes. +# nodes == 2 -> split: node 0 runs vllm serve on all its GPUs, node 1 runs +# the trainer on all its GPUs. Roles rendezvous through the +# shared /scratchspace mount (node 0 publishes its serve +# address; the trainer signals completion). +# nodes >= 3 -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1) +# doing multi-node DDP. The head trainer (node 1, accelerate +# machine_rank 0) publishes its IP for accelerate's c10d +# rendezvous; all trainer nodes read both the serve address and +# the head-trainer address from /scratchspace. NOTE: only global +# rank 0 fetches hidden states from the single serve and +# broadcasts to the rest (DataLoaderDispatcher), so the single +# serve is the throughput ceiling — adding trainer nodes scales +# effective batch / compute, not data-production throughput. # # Env vars (required): # HF_MODEL_CKPT Target model path. Used by both vllm serve (as the @@ -56,7 +63,8 @@ # TRAIN_GPUS single-node only: CUDA_VISIBLE_DEVICES for the trainer. # default = all local GPUs except SERVE_GPU. # SERVE_ADVERTISE_IP multi-node only: address node 1 should dial. default is -# node 0's first `hostname -I` IP. 
+# node 0's routable IP (its resolved Slurm node name, else +# its first non-loopback / non-link-local IP). # # All script args are forwarded to launch_train.sh (typically: --config # plus OmegaConf dotlist overrides). @@ -112,7 +120,7 @@ export PATH=$PATH:/workspace/.local/bin ################################################################################################### -trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER +trap 'error_handler $0 $LINENO' ERR if [ -z "$HF_MODEL_CKPT" ]; then echo "ERROR: HF_MODEL_CKPT must be set." >&2; exit 1 @@ -154,11 +162,9 @@ launch_vllm() { # would expose *zero* GPUs (not all), so leave it unset to use the whole node. local -a gpu_env=() [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") - # Optional single-value memory knobs (each a space-free env value, so they - # survive nemo_run's unquoted `export FOO=value`; assembled into --flag value - # pairs here). --cpu-offload-gb spills N GB of weights/GPU to host RAM, the - # key lever for fitting a large model on too-few GPUs (slower, prefill-only - # use tolerates it). --max-model-len / --max-num-seqs trim KV/activation. + # Optional single-value memory knobs (see header), assembled into --flag + # value pairs. Each is a space-free env value so it survives nemo_run's + # unquoted `export FOO=value`. local -a opt_args=() [ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB") [ -n "${SERVE_MAX_MODEL_LEN:-}" ] && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN") @@ -222,28 +228,52 @@ wait_vllm_ready() { # per process; multiple workers would duplicate requests against the server. run_trainer_and_export() { local url="$1" cvd="$2" - echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all})..." + # Optional multi-node trainer routing (see dispatch section). Defaults keep + # the original single-trainer-node behavior: no --num_nodes, export on rank 0. 
+ local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}" + echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..." # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all). local -a gpu_env=() [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") + # Engage accelerate multi-node routing only when >1 trainer node; a single + # trainer node keeps the original invocation (no --num_nodes) verbatim. + local -a mn_args=() + if [ "${num_tnodes}" -gt 1 ]; then + mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank") + fi "${gpu_env[@]}" bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \ "${SCRIPT_ARGS[@]}" \ + "${mn_args[@]}" \ data.streaming_server_url="$url" \ data.streaming_model_name="$HF_MODEL_CKPT" \ data.streaming_shared_storage_path="$SERVE_SCRATCH" \ training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; } + # Export only on the head trainer (machine_rank 0); non-head trainer nodes + # would race writing the same export dir. The export reads the saved + # checkpoint (training.output_dir), not the serve, so it is serve-independent. + if [ "${mrank}" -ne 0 ]; then + echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)." + return 0 + fi + + # Export the trained draft to HF format. Derive the checkpoint dir from the + # forwarded `training.output_dir=` dotlist (defaulting to the EAGLE + # convention) so EAGLE and DFlash runs each export their own output_dir. + # EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass + # --trust_remote_code; empty by default so EAGLE behavior is unchanged. 
+ local out_dir + out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1) + out_dir="${out_dir:-/scratchspace/eagle3}" python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \ - --model_path /scratchspace/eagle3 \ - --export_path /scratchspace/export + --model_path "$out_dir" \ + --export_path "${EXPORT_PATH:-/scratchspace/export}" \ + ${EXPORT_EXTRA_ARGS:-} } # --------------------------------------------------------------------------- -# Topology dispatch (driven by the Slurm allocation, i.e. the yaml `nodes:`): -# SLURM_NNODES == 1 -> co-located: vllm on $SERVE_GPU, trainer on the rest. -# SLURM_NNODES >= 2 -> split: node 0 serves on all its GPUs, node 1 trains on -# all its GPUs; they rendezvous via /scratchspace. -# nemo_run runs this script once per node, so we branch on $SLURM_NODEID. +# Topology dispatch (see header): nemo_run runs this script once per node, so +# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads. # --------------------------------------------------------------------------- NNODES="${SLURM_NNODES:-1}" NODEID="${SLURM_NODEID:-0}" @@ -299,27 +329,55 @@ elif [ "$NODEID" -eq 0 ]; then while [ ! -f "$DONE_FILE" ]; do sleep 10; done echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)." -elif [ "$NODEID" -eq 1 ]; then - # ---------------------- multi-node: trainer node ----------------------- - # Release the serve node on any exit (success or failure) so it doesn't hang. - trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT +elif [ "$NODEID" -ge 1 ]; then + # -------------------- multi-node: trainer node(s) ---------------------- + # Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which + # map to 0-based accelerate machine ranks (head trainer = SLURM node 1). 
+ NUM_TRAINER_NODES=$(( NNODES - 1 )) + TRAINER_RANK=$(( NODEID - 1 )) + TRAINER_ADDR_FILE="/scratchspace/.trainer_addr" + + # Only the head trainer (rank 0) signals the serve node to release on exit; + # a non-head node exiting first must NOT tear the serve down early. + if [ "$TRAINER_RANK" -eq 0 ]; then + trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT + rm -f "$TRAINER_ADDR_FILE" # clear stale rendezvous state + fi - echo "Trainer node waiting (up to ${SERVE_READY_TIMEOUT}s) for the serve address..." + echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..." for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do [ -f "$SERVE_ADDR_FILE" ] && break sleep 1 done [ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; } URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}" - wait_vllm_ready "$URL" || exit 1 - run_trainer_and_export "$URL" "" || exit 1 -else - # ------------- multi-node: extra nodes (unused by default) ------------- - echo "Node rank ${NODEID} idle: the default split uses node 0 = vllm serve, node 1 = trainer." - echo "Multi-node *training* (>1 trainer node) is not wired up yet; allocate exactly 2 nodes." - while [ ! -f "$DONE_FILE" ]; do sleep 10; done + if [ "$NUM_TRAINER_NODES" -le 1 ]; then + # Original 1-serve + 1-trainer topology: single-node DDP, unchanged. + run_trainer_and_export "$URL" "" || exit 1 + else + # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's + # c10d rendezvous (port 29500); all trainer nodes read it and join. Reuse + # the serve node's IP-resolution logic (avoid link-local / loopback). 
+ if [ "$TRAINER_RANK" -eq 0 ]; then + head_addr="${TRAINER_ADVERTISE_IP:-}" + [ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) + [ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1) + [ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}') + echo "$head_addr" > "$TRAINER_ADDR_FILE" + echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous." + else + echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..." + for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do + [ -f "$TRAINER_ADDR_FILE" ] && break + sleep 1 + done + [ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; } + fi + HEAD_IP=$(cat "$TRAINER_ADDR_FILE") + run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1 + fi fi ################################################################################################### diff --git a/tools/launcher/core.py b/tools/launcher/core.py index aa60bbad9e9..f6ae6493af3 100644 --- a/tools/launcher/core.py +++ b/tools/launcher/core.py @@ -286,6 +286,9 @@ def build_slurm_executor( retries=0, packager=packager, srun_args=slurm_config.srun_args, + # --segment=: pin all nodes into one topology block (one NVL72 / NVLink + # domain). None -> omitted, scheduler places freely (default behavior). + segment=slurm_config.segment, ) return executor diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml new file mode 100644 index 00000000000..b12c3b0f538 --- /dev/null +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml @@ -0,0 +1,64 @@ +# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4). +# +# Single-task pipeline that exercises the full convert→save→export path WITHOUT +# actually training. 
Uses the same `common/specdec/dflash_online_training.sh` +# entrypoint as a real DFlash run; all dry-run behaviour is expressed as dotlist +# overrides on `main.py` (shared with EAGLE3 — `--dry_run` is mode-agnostic): +# +# --dry_run → main.py skips trainer.train(), saves +# the (untrained) ModelOpt checkpoint +# to training.output_dir right after +# mtsp.convert(model, [("dflash", ...)]) +# data.offline_data_path= → DataArguments derives data.mode from +# the data-source fields, so setting an +# offline path makes mode='offline' → +# use_offline_training=True. Combined +# with use_fake_base_for_offline=true +# this loads a FakeBaseModel (only +# embed_tokens + lm_head), so the ~1T +# MoE base fits on a single GPU. The +# file is never read in --dry_run mode. +# model.trust_remote_code=true → Kimi-K2.5 (deepseek_v3 arch) ships a +# custom modeling file +# dflash.dflash_mask_token_id=163838 → Kimi-K2.5 has no dedicated mask token +# ([EOS]=163585, [PAD]=163839); 163838 is +# a reserved slot used as the DFlash mask +# (matches the real Kimi-K2.5 DFlash run) +# +# The dflash_online_training.sh export block then writes an HF-format DFlash draft +# to /scratchspace/dflash/exported-checkpoint-final with the correct architecture +# (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by +# design. Useful for smoke-testing the launcher / convert / export plumbing and +# validating downstream loaders without paying for a real training run. +# +# Usage: +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes + +job_name: Kimi-K2.5_DFlash_dryrun +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4/ + + # Convert → save → export (no training). 
+ task_0: + script: common/specdec/dflash_online_training.sh + args: + - --dry_run + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml + - model.model_name_or_path=<> + - model.use_fake_base_for_offline=true + - model.trust_remote_code=true + - data.offline_data_path=/tmp/dryrun-placeholder + - training.output_dir=/scratchspace/dflash + - training.disable_tqdm=true + - dflash.dflash_mask_token_id=163838 + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 1 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml new file mode 100644 index 00000000000..ff99ae62c7f --- /dev/null +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml @@ -0,0 +1,131 @@ +# DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on +# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve + +# trainer split, same hardware reasoning — but trains a DFlash drafter instead of +# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the +# dflash recipe. +# +# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native +# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, +# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4, +# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes. +# +# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired +# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py +# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets +# dflash_offline=True and the DFlash module consumes the streamed hidden states +# (base_model_outputs) instead of running the fake base. 
The vLLM connector, +# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps +# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST +# captured layer into base_model_hidden_states (used for DFlash self-logit +# distillation) and the REST into aux_hidden_states (DFlash's concatenated +# target-layer features). So n_captured must be (num DFlash target layers + 1). +# +# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; +# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3): +# DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5) +# = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) = +# [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the +# dataset yields 5 aux layers, matching the 5-layer DFlash draft block. +# +# answer_only_loss: forced false here. DFlash's recipe default is true, which +# requires the tokenizer chat template to carry {% generation %} tags so the +# streaming dataset can derive an assistant-token mask; Kimi's template does not, +# and the streaming path (unlike online) does not inject data.chat_template. To +# train assistant-only later, supply a generation-tagged template and flip this on. +# +# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# SLURM_PARTITION=batch \ +# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ +# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# NEMORUN_HOME=$PWD +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \ +# identity=$HOME/.ssh/id_ecdsa detach=True --yes +# +# The export lands in /scratchspace/export. To benchmark it, point +# specdec_bench.yaml's --draft_model_dir there (or copy it under /hf-local). 
+ +job_name: Kimi-K2.5-NVFP4_DFlash_streaming +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 + + # Step 1: Build input conversations (model-agnostic) + task_0: + script: common/eagle3/make_dataset.sh + args: + - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml + - --full-conversations + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), + # so request 4 even though make_dataset is CPU-only. + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node. + # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh): + # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ. + task_1: + script: common/eagle3/train_eagle_streaming.sh + args: + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml + - model.model_name_or_path=<> + - model.use_fake_base_for_offline=true + - model.trust_remote_code=true + - data.mode=streaming + - data.data_path=/scratchspace/data/train.jsonl + # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4 + # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker + # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. + - data.streaming_prefetch=8 + - training.output_dir=/scratchspace/dflash + # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. + - training.training_seq_len=4096 + - training.disable_tqdm=true + - training.num_train_epochs=1 + - training.max_steps=3000 + # See header: Kimi's template lacks {% generation %} tags; train on all tokens. 
+ - training.answer_only_loss=false + # dflash.yaml sets report_to=tensorboard, but the vLLM container has no + # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable + # reporting (loss still prints to stdout via logging_steps). + - training.report_to=none + # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838 + # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run). + - dflash.dflash_mask_token_id=163838 + environment: + - HF_MODEL_CKPT: <> + # No spaces in values: nemo_run emits `export FOO=value` unquoted. + # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. + - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" + - SERVE_TP: "4" + # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. + - EXPORT_EXTRA_ARGS: "--trust_remote_code" + # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's + # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the + # training seq len and leave headroom for activation spikes. + - SERVE_MAX_MODEL_LEN: "4096" + # Small batches: smaller per-step MoE compute stays under the engine timeout. + - SERVE_MAX_NUM_SEQS: "4" + - SERVE_GPU_MEM_UTIL: "0.8" + - SERVE_READY_TIMEOUT: "2400" + - SERVE_EXTRA_ARGS: "--trust-remote-code" + # The killer was "RPC call to sample_tokens timed out" — a worker stalls on + # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default + # execute-model timeout, so EngineCore dies. Extend the timeouts that govern + # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. 
+ - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" + - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" + slurm_config: + _factory_: "slurm_factory" + nodes: 2 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml new file mode 100644 index 00000000000..fb92ba11234 --- /dev/null +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -0,0 +1,133 @@ +# DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on +# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve + +# trainer split, same hardware reasoning — but trains a DFlash drafter instead of +# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the +# dflash recipe. +# +# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native +# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, +# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4, +# whole node), nodes 1-2 = DFlash trainers (fake base, multi-node DDP), 4 GPUs each, 3 nodes. +# +# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired +# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py +# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets +# dflash_offline=True and the DFlash module consumes the streamed hidden states +# (base_model_outputs) instead of running the fake base. The vLLM connector, +# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps +# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST +# captured layer into base_model_hidden_states (used for DFlash self-logit +# distillation) and the REST into aux_hidden_states (DFlash's concatenated +# target-layer features). 
So n_captured must be (num DFlash target layers + 1). +# +# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; +# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3): +# DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5) +# = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) = +# [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the +# dataset yields 5 aux layers, matching the 5-layer DFlash draft block. +# +# answer_only_loss: forced false here. DFlash's recipe default is true, which +# requires the tokenizer chat template to carry {% generation %} tags so the +# streaming dataset can derive an assistant-token mask; Kimi's template does not, +# and the streaming path (unlike online) does not inject data.chat_template. To +# train assistant-only later, supply a generation-tagged template and flip this on. +# +# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# SLURM_PARTITION=batch \ +# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ +# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# NEMORUN_HOME=$PWD +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \ +# identity=$HOME/.ssh/id_ecdsa detach=True --yes +# +# The export lands in /scratchspace/export. To benchmark it, point +# specdec_bench.yaml's --draft_model_dir there (or copy it under /hf-local). 
+ +job_name: Kimi-K2.5-NVFP4_DFlash_streaming_multi_node +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 + + # Step 1: Build input conversations (model-agnostic) + task_0: + script: common/eagle3/make_dataset.sh + args: + - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml + - --full-conversations + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), + # so request 4 even though make_dataset is CPU-only. + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # Step 2: Streaming DFlash training (node0 serve TP=4 / nodes 1-2 train, multi-node DDP), 4 GPU/node. + # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh): + # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ. + task_1: + script: common/eagle3/train_eagle_streaming.sh + args: + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml + - model.model_name_or_path=<> + - model.use_fake_base_for_offline=true + - model.trust_remote_code=true + - data.mode=streaming + - data.data_path=/scratchspace/data/train.jsonl + # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4 + # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker + # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. + - data.streaming_prefetch=8 + - training.output_dir=/scratchspace/dflash + # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. + - training.training_seq_len=4096 + - training.disable_tqdm=true + - training.num_train_epochs=1 + - training.ar_validate_steps=500000 + - training.max_steps=500 + # See header: Kimi's template lacks {% generation %} tags; train on all tokens. 
+ - training.answer_only_loss=false + # dflash.yaml sets report_to=tensorboard, but the vLLM container has no + # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable + # reporting (loss still prints to stdout via logging_steps). + - training.report_to=none + # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838 + # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run). + - dflash.dflash_mask_token_id=163838 + environment: + - HF_MODEL_CKPT: <> + # No spaces in values: nemo_run emits `export FOO=value` unquoted. + # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. + - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" + - SERVE_TP: "4" + # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. + - EXPORT_EXTRA_ARGS: "--trust_remote_code" + # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's + # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the + # training seq len and leave headroom for activation spikes. + - SERVE_MAX_MODEL_LEN: "4096" + # Small batches: smaller per-step MoE compute stays under the engine timeout. + - SERVE_MAX_NUM_SEQS: "4" + - SERVE_GPU_MEM_UTIL: "0.8" + - SERVE_READY_TIMEOUT: "2400" + - SERVE_EXTRA_ARGS: "--trust-remote-code" + # The killer was "RPC call to sample_tokens timed out" — a worker stalls on + # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default + # execute-model timeout, so EngineCore dies. Extend the timeouts that govern + # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. 
+ - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" + - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" + slurm_config: + _factory_: "slurm_factory" + nodes: 3 + segment: 3 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml new file mode 100644 index 00000000000..a943f39c27e --- /dev/null +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml @@ -0,0 +1,81 @@ +# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM. +# +# Serves Kimi-K2.5-NVFP4 in-process (no HTTP server — specdec_bench drives an +# AsyncLLM) at TP=4 with expert parallelism, attaches a trained/exported DFLASH +# draft, and benchmarks speculative decoding on MT-Bench. Writes timing.json + +# aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl +# to /scratchspace/specdec_bench/. +# +# Hardware = GB200/Blackwell (HSG), same reasoning as hf_streaming_eagle3.yaml: +# Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the 192 GB/GPU of GB200; it fits +# at TP=4 on ONE 4-GPU node with no cpu-offload. On CW H100 it has no native FP4 +# and falls back to offload, so the working path is GB200. +# +# DFLASH specifics: +# - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8); +# --draft_length does NOT apply to DFLASH. To override sampling / engine args +# (e.g. speculative_num_draft_tokens, temperature), write a runtime-params +# yaml and add `- --runtime_params <path>` below — see +# examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern). +# - --draft_model_dir must point at a trained+exported HF-format DFLASH draft +# (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Edit the path +# below, or override on the CLI: pipeline.task_0.args[0]="--draft_model_dir /hf-local/<your-draft-dir>" +# - Kimi needs --trust_remote_code for both tokenizer and model. 
+# +# NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so +# it runs without any data-prep step. To benchmark on SPEED-Bench instead, first +# generate + stage a split: +# python3 examples/specdec_bench/prepare_data.py --dataset speed --config all +# (splits: qualitative, throughput_1k, throughput_16k, ...) then swap the +# `--mtbench` arg for: +# - --dataset speed +# - --dataset_path modules/Model-Optimizer/examples/specdec_bench/data/speed/throughput_16k +# +# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH support; +# on GB200/aarch64 use an aarch64 vLLM image new enough for DFLASH (validated on +# a 0511 nightly). Override with: pipeline.task_0.slurm_config.container=<image> +# +# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# SLURM_PARTITION=batch \ +# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ +# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# NEMORUN_HOME=$PWD +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/specdec_bench.yaml \ +# identity=$HOME/.ssh/id_ecdsa detach=True --yes + +job_name: Kimi-K2.5-NVFP4_DFLASH_specdec_bench + +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 + + task_0: + script: common/specdec_bench/run.sh + args: + # TODO: point at your trained + exported HF-format DFLASH draft checkpoint. 
+ - --draft_model_dir /hf-local/nvidia/Kimi-K2.5-DFlash + - --speculative_algorithm DFLASH + - --engine VLLM + - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl + - --tp_size 4 + - --ep_size 4 + - --concurrency 32 + - --output_length 1024 + - --trust_remote_code + - --aa_timing + - --show_progress + - --save_dir /scratchspace/specdec_bench + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py index 8ecd51f6f86..0bcfff14ad9 100644 --- a/tools/launcher/slurm_config.py +++ b/tools/launcher/slurm_config.py @@ -48,6 +48,11 @@ class SlurmConfig: gpus_per_node: int = 1 time: str = "04:00:00" local: bool = False + # Slurm --segment=<N>: force the job's nodes into a single topology block. + # On a topology/block cluster (e.g. GB200 NVL72, where one block = one NVLink + # domain) set this to the node count to keep all nodes in one NVL72 so + # inter-node traffic rides NVLink. None = let the scheduler place freely. 
+ segment: Optional[int] = None @run.cli.factory @@ -68,6 +73,7 @@ def slurm_factory( srun_args: list[str] = ["--no-container-mount-home"], array: Optional[str] = None, time: str = "04:00:00", + segment: Optional[int] = None, ) -> SlurmConfig: """Generic Slurm factory — configure via environment variables or CLI overrides.""" return SlurmConfig( @@ -84,4 +90,5 @@ def slurm_factory( srun_args=srun_args, array=array, time=time, + segment=segment, ) From 6b8e784fa539dcbedfd9712cb204bc6c4d33986f Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 04:10:38 +0000 Subject: [PATCH 02/14] iterable dataset to map-style dataset Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/eagle_utils.py | 8 +- examples/speculative_decoding/main.py | 10 +- .../plugins/hf_streaming_dataset.py | 405 ++++++------------ .../speculative/plugins/hf_training_args.py | 3 + .../test_eagle_streaming.py | 3 +- .../plugins/test_hf_streaming_dataset.py | 240 +++++------ .../common/eagle3/train_eagle_streaming.sh | 10 +- .../hf_streaming_dflash_multi_node.yaml | 24 +- 8 files changed, 272 insertions(+), 431 deletions(-) diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py index f9675e54161..bcdcf15e8c2 100644 --- a/examples/speculative_decoding/eagle_utils.py +++ b/examples/speculative_decoding/eagle_utils.py @@ -88,14 +88,16 @@ def make_speculative_data_module( ds = load_dataset("json", data_files=data_args.data_path, split="train") if data_args.sample_size > 0: ds = ds.select(range(data_args.sample_size)) + # Map-style dataset: each rank fetches its own DistributedSampler shard. + # Fetch concurrency comes from the DataLoader's num_workers, not a config knob; + # shuffling/order is the sampler's job, so no seed is threaded here. + # ``server_urls`` accepts a comma-separated string for multi-server fan-out. 
streaming_cfg = EagleVllmStreamingConfig( - server_url=data_args.streaming_server_url, + server_urls=data_args.streaming_server_url, model=data_args.streaming_model_name, shared_storage_root=data_args.streaming_shared_storage_path, max_seq_len=train_len, answer_only_loss=answer_only_loss, - prefetch=data_args.streaming_prefetch, - seed=seed, ) train_dataset = EagleVllmStreamingDataset( entries=ds, diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index 9b7a9f44d2e..4405bf0cd90 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -278,12 +278,12 @@ def train(): ): callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps)) if recipe.data.mode == "streaming": - # Skip-on-resume happens inside the dataset (no re-fetch from server); - # disable HF Trainer's own data skip so the offset isn't applied twice. - from modelopt.torch.speculative.plugins.hf_streaming_dataset import StreamingResumeCallback - + # The streaming dataset is map-style, so HF Trainer's default resume would + # fast-forward by re-iterating (= re-fetching) every consumed batch just to + # discard it, hammering the server. Disable the data skip: on resume, weights/ + # optimizer/global_step still restore from the checkpoint; only the data order + # restarts from the top (acceptable for single-epoch streaming). training_args.ignore_data_skip = True - callbacks.append(StreamingResumeCallback()) trainer = EagleTrainerWithAccLog( model=model, diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py index 31adbc96bf4..6b050f00add 100644 --- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py +++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py @@ -13,11 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Streaming datasets that fetch per-sample hidden states from a running inference server. - -The base class :class:`StreamingDataset` owns all the backend-/algorithm- -agnostic plumbing: threading, queue, tokenization, the bounded sliding-window -producer, loss_mask alignment, and HTTP-client lifecycle. Concrete subclasses +"""Map-style datasets that fetch per-sample hidden states from a running inference server. + +This is the streaming sibling of :class:`OfflineSupervisedDataset`: instead of +reading a pre-dumped ``.pt`` file in ``__getitem__``, it fetches the per-sample +hidden states from a live inference server over HTTP. It is a plain +``torch.utils.data.Dataset`` (map-style), so DDP sharding is handled the standard +way -- HF Trainer wraps it in a ``DistributedSampler`` and each rank's DataLoader +calls ``__getitem__`` only for that rank's indices. Each rank therefore fetches +**only its own shard** (no rank-0 funnel, no broadcast); aggregate read bandwidth +scales with the number of trainer ranks. + +Fetch concurrency comes from the DataLoader's ``num_workers`` (each worker process +issues one blocking request at a time); there is no in-process producer thread. +Keep ``num_workers`` modest and bounded so the per-server in-flight request count +(``ranks-hitting-a-server x num_workers``) stays near the server's ``max_num_seqs`` +-- flooding a cold NVFP4 MoE server can stall a worker past vLLM's execute-model +timeout and kill EngineCore. + +The base class :class:`StreamingDataset` owns the backend-/algorithm-agnostic +plumbing: tokenization, the resample-on-failure ``__getitem__`` loop, the +consecutive-failure circuit breaker, and loss_mask alignment. Concrete subclasses specialize along two axes: - **Backend** (how to talk to the server, how to decode the response): override @@ -25,22 +41,14 @@ - **Algorithm** (how to shape the per-sample dict for the trainer): override :meth:`_format`. 
-:class:`EagleVllmStreamingDataset` is currently the only concrete -combination (Eagle algorithm × vLLM backend); future combinations live as -sibling subclasses. - -Requires ``dataloader_num_workers=0``: multiple workers would each spawn their -own asyncio loop and issue duplicate requests against the server. +:class:`EagleVllmStreamingDataset` is currently the only concrete combination +(Eagle algorithm x vLLM backend); future combinations live as sibling subclasses. """ from __future__ import annotations -import asyncio import contextlib import os -import queue -import random -import threading from pathlib import Path from typing import TypedDict @@ -48,17 +56,13 @@ import torch from pydantic import BaseModel, ConfigDict, Field, field_validator from safetensors import safe_open -from torch.utils.data import IterableDataset, get_worker_info -from transformers import TrainerCallback +from torch.utils.data import Dataset from transformers.trainer_pt_utils import LabelSmoother -from modelopt.torch.utils import distributed as dist_utils from modelopt.torch.utils import print_rank_0, warn_rank_0 IGNORE_TOKEN_ID = LabelSmoother.ignore_index -_SENTINEL = object() - def _tokenize_with_loss_mask( tokenizer, @@ -104,33 +108,33 @@ def _tokenize_with_loss_mask( class StreamingConfig(BaseModel): """Static tuning knobs for :class:`StreamingDataset`. - Bundles the rarely-changing settings (loss masking, concurrency, HTTP timeout) - so the dataset ctor takes only ``entries`` + ``tokenizer`` + this config. + Bundles the rarely-changing settings (loss masking, HTTP timeout) so the dataset + ctor takes only ``entries`` + ``tokenizer`` + this config. """ model_config = ConfigDict(extra="forbid") answer_only_loss: bool = False - prefetch: int = Field(default=64, ge=1) request_timeout: float = Field(default=600.0, gt=0) # Token-level cap applied during tokenization (right-truncation). Must hold # ``max_seq_len <= vllm.max_model_len``. ``None`` disables truncation. 
max_seq_len: int | None = None - # Must be identical on every rank — the dataset shuffles with this seed then - # stripes by rank, so equal seeds are required for the partition to be disjoint. - seed: int = 0 - # Circuit breaker: raise after this many consecutive _fetch failures so a dead - # server doesn't silently drain the corpus. + # Circuit breaker: raise after this many consecutive _fetch failures (per worker + # process) so a dead server doesn't silently resample the whole corpus. fail_after_consecutive_skips: int = Field(default=16, ge=1) -class StreamingDataset(IterableDataset): - """Base class: stream per-sample hidden states from a running inference server. +class StreamingDataset(Dataset): + """Base class: map-style dataset that streams per-sample hidden states from a server. Backend- and algorithm-agnostic; subclasses implement :meth:`_fetch` (backend) and :meth:`_format` (algorithm). The dict shape exchanged between them is the algorithm-level contract, declared as a ``TypedDict`` in :attr:`fetch_payload_cls` and validated against the actual ``_fetch`` output on every sample. + + ``__getitem__`` must always return a valid sample for the sampler's index, so it + resamples forward through the corpus on an unfit entry or a fetch failure rather + than skipping (a skip would shrink the batch and desync DDP). """ config_cls: type[StreamingConfig] = StreamingConfig @@ -145,217 +149,78 @@ def __init__( tokenizer, config: StreamingConfig | None = None, ): - """Hold the *full* corpus on every rank; fetch lazily, rank 0 only. + """Hold the full corpus; fetch lazily, per index, in ``__getitem__``. - DDP sharding is delegated to Accelerate's ``DataLoaderDispatcher``: rank 0 - consumes the dataset and broadcasts each batch; non-zero ranks rely on - :meth:`__iter__`'s rank guard. The corpus is held in full on every rank -- - the dispatcher reads only rank 0's stream, so sharding here would just - shrink that view. 
Shuffling with ``config.seed`` runs on every rank so - the order is reproducible regardless of which rank ends up fetching. + DDP sharding is handled by HF Trainer's ``DistributedSampler``: each rank's + DataLoader requests only its own indices, so each rank fetches only its + shard. The corpus order is left as given -- the sampler shuffles indices + (seeded by ``training_args.seed``), so no shuffle is needed here. Args: entries: Untokenized per-sample dicts from the input jsonl. Schema is - subclass-defined (see :meth:`_tokenize_entry`); passed through to :meth:`_fetch`. + subclass-defined (see :meth:`_tokenize_entry`); passed to :meth:`_fetch`. tokenizer: HF tokenizer; used for client-side tokenization and the server/client loss-mask alignment in :meth:`_fetch`. - config: Tuning knobs (prefetch, timeout, seed, ...); defaults to + config: Tuning knobs (timeout, answer_only_loss, ...); defaults to ``self.config_cls()``. See :class:`StreamingConfig`. """ if not entries: raise ValueError("entries is empty") self.tokenizer = tokenizer self.config = config if config is not None else self.config_cls() - # One-shot, consumed by the next __iter__. - self._resume_skip = 0 - - indices = list(range(len(entries))) - random.Random(self.config.seed).shuffle(indices) - self.entries = [entries[i] for i in indices] - rank, world = dist_utils.rank(), dist_utils.size() - print_rank_0( - f"[{type(self).__name__}] rank {rank}/{world}: " - f"holds {len(self.entries)} entries (full corpus; rank 0 fetches)" - ) + # Materialize to a plain list so DataLoader worker processes fork it cheaply. + self.entries = list(entries) + # Per-process consecutive-failure counter for the circuit breaker. Reset to 0 + # on every successful fetch; tripped only by fetch failures (not unfit entries). 
+ self._consecutive_fail = 0 + print_rank_0(f"[{type(self).__name__}] map-style dataset over {len(self.entries)} entries") def __len__(self) -> int: return len(self.entries) - def set_resume_position(self, skip: int) -> None: - """Drop the first ``skip`` entries on the next ``__iter__`` without fetching. + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: + """Tokenize -> fetch -> format the sample at ``idx``, resampling on miss. - One-shot; cleared once iteration starts. Used by - :class:`StreamingResumeCallback` on HF Trainer checkpoint resume so the - server is not re-queried for already-consumed samples. + Always returns a valid sample. An unfit entry (tokenization yields nothing) or + a fetch failure causes a forward probe to the next index; fetch failures bump + the circuit breaker, which raises once ``fail_after_consecutive_skips`` is hit. """ - self._resume_skip = skip - - @staticmethod - def _verify_accelerate_dispatcher() -> None: - """Raise if Accelerate is initialized for DDP with ``dispatch_batches=False``. - - Best-effort: no-op when Accelerate isn't installed/initialized or in single-process. - """ - try: - from accelerate.state import AcceleratorState - except ImportError: - return - if not AcceleratorState._shared_state: - return - state = AcceleratorState() - if getattr(state, "num_processes", 1) <= 1: - return - # Field moved to ``dataloader_config`` in newer Accelerate; check both. - dispatch = getattr(state, "dispatch_batches", None) - if dispatch is None: - dl_cfg = getattr(state, "dataloader_config", None) - if dl_cfg is not None: - dispatch = getattr(dl_cfg, "dispatch_batches", None) - if dispatch is False: - raise RuntimeError( - "StreamingDataset requires Accelerate's DataLoaderDispatcher " - "(dispatch_batches=True); got False — non-zero ranks would receive no data." 
- ) - - def __iter__(self): - # IterableDataset with DataLoader workers > 0 would spawn one asyncio loop - # per worker, each issuing the full request set — silent Nx duplication - # against the server. Fail loud instead. - if get_worker_info() is not None: - raise RuntimeError( - f"{type(self).__name__} requires dataloader_num_workers=0; " - "multiple workers would each spawn an asyncio loop and duplicate requests." - ) - # Without dispatch_batches the rank-0 guard below would silently starve - # non-zero ranks; fail loud instead. - self._verify_accelerate_dispatcher() - # Only rank 0 fetches; non-zero ranks receive batches via the dispatcher's broadcast. - if dist_utils.rank() != 0: - return - # Fresh producer per __iter__ call so re-iteration (which shouldn't - # happen in 1-epoch streaming) at least doesn't deadlock. - q: queue.Queue = queue.Queue(maxsize=self.config.prefetch) - stop = threading.Event() - skip = self._resume_skip - self._resume_skip = 0 # one-shot - entries = self.entries[skip:] if skip else self.entries - - def run(): + n = len(self.entries) + for offset in range(n): + entry = self.entries[(idx + offset) % n] + sample = self._tokenize_entry(entry) + if sample is None: + continue # entry unfit pre-fetch; server not at fault, try the next one try: - asyncio.run(self._produce(q, stop, entries)) + fetched = self._fetch(sample) except Exception as e: - q.put(e) # surface to consumer - finally: - q.put(_SENTINEL) - - thread = threading.Thread(target=run, daemon=True) - thread.start() - - try: - while True: - item = q.get() - if item is _SENTINEL: - break - if isinstance(item, Exception): - raise item - yield item - finally: - stop.set() - # Drain any leftover items so producer can exit - with contextlib.suppress(queue.Empty): - while True: - q.get_nowait() - - async def _produce(self, q: queue.Queue, stop: threading.Event, entries): - """Stream ``entries`` through a sliding window of at most ``prefetch`` in-flight tasks. 
- - Counter is local (single writer); ``_process`` tasks report outcome via return value. - The circuit breaker has *batch-level* (not per-task) granularity: when - ``asyncio.wait(FIRST_COMPLETED)`` returns several tasks in the same loop turn, - ``consecutive_skips`` reflects set-iteration order over ``done`` -- sufficient - for "detect a dead server" but not strict temporal ordering. - - Args: - q: Bounded queue drained by :meth:`__iter__`; full queue backpressures fetching. - stop: Set by the consumer to request shutdown; checked between samples. - entries: Resume-adjusted slice of ``self.entries`` to fetch this iteration. - """ - timeout = httpx.Timeout(self.config.request_timeout, connect=10.0) - threshold = self.config.fail_after_consecutive_skips - consecutive_skips = 0 - async with httpx.AsyncClient(timeout=timeout) as client: - pending: set[asyncio.Task] = set() - entries_iter = iter(entries) - exhausted = False - try: - while not stop.is_set(): - while len(pending) < self.config.prefetch and not exhausted: - try: - entry = next(entries_iter) - except StopIteration: - exhausted = True - break - pending.add(asyncio.create_task(self._process(client, entry, q, stop))) - if not pending: - break - done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) - for task in done: - outcome = task.result() # re-raises unexpected errors - if outcome is True: - consecutive_skips = 0 - elif outcome is False: - consecutive_skips += 1 - # None -> entry unfit pre-fetch; server not at fault - if consecutive_skips >= threshold: - raise RuntimeError( - f"{consecutive_skips} consecutive _fetch failures " - f"in {type(self).__name__}; server likely down." 
- ) - finally: - for task in pending: - task.cancel() - if pending: - await asyncio.gather(*pending, return_exceptions=True) - - async def _process( - self, - client: httpx.AsyncClient, - entry: dict, - q: queue.Queue, - stop: threading.Event, - ) -> bool | None: - """Tokenize -> fetch -> format -> enqueue. - - Returns True on enqueue, False on fetch failure (bumps breaker), None - when the entry is unfit pre-fetch (no breaker effect). - """ - if stop.is_set(): - return None - sample = await asyncio.to_thread(self._tokenize_entry, entry) - if sample is None: - return None - try: - fetched = await self._fetch(client, sample) - except Exception as e: - warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}") - return False - if fetched is None: - return False - if self.fetch_payload_cls is not None: - # ``__required_keys__`` is a TypedDict runtime attribute mypy doesn't - # track on ``type``; the assignment site guarantees it's a TypedDict. - required: frozenset[str] = self.fetch_payload_cls.__required_keys__ # type: ignore[attr-defined] - missing = required - set(fetched) - if missing: - raise RuntimeError( - f"{type(self).__name__}._fetch missing required keys {missing}; " - f"{self.fetch_payload_cls.__name__} requires " - f"{set(required)}, got {set(fetched)}" - ) - data = self._format(fetched) - # Blocking put -> backpressure when trainer is slow. - await asyncio.to_thread(q.put, data) - return True + warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}") + fetched = None + if fetched is None: + self._consecutive_fail += 1 + if self._consecutive_fail >= self.config.fail_after_consecutive_skips: + raise RuntimeError( + f"{self._consecutive_fail} consecutive _fetch failures in " + f"{type(self).__name__}; server likely down." 
+ ) + continue # resample forward + self._consecutive_fail = 0 + if self.fetch_payload_cls is not None: + # ``__required_keys__`` is a TypedDict runtime attribute mypy doesn't + # track on ``type``; the assignment site guarantees it's a TypedDict. + required: frozenset[str] = self.fetch_payload_cls.__required_keys__ # type: ignore[attr-defined] + missing = required - set(fetched) + if missing: + raise RuntimeError( + f"{type(self).__name__}._fetch missing required keys {missing}; " + f"{self.fetch_payload_cls.__name__} requires " + f"{set(required)}, got {set(fetched)}" + ) + return self._format(fetched) + raise RuntimeError( + f"{type(self).__name__}: no fetchable sample found in the entire corpus " + f"({n} entries) starting at index {idx}." + ) def _tokenize_entry(self, entry: dict) -> dict | None: """Tokenize a single entry. @@ -382,14 +247,14 @@ def _tokenize_entry(self, entry: dict) -> dict | None: "loss_mask": loss_mask, } - async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> dict | None: + def _fetch(self, sample: dict) -> dict | None: """Backend hook: send the request and decode the server's response. - Override in subclass. Any scratch resources (per-request files, mmap'd - buffers) must be released before returning. + Override in subclass. Synchronous (called from a DataLoader worker). Any + scratch resources (per-request files, mmap'd buffers) must be released before + returning. Args: - client: Shared async HTTP client owned by :meth:`_produce`. sample: :meth:`_tokenize_entry` output: ``{"cid": str, "token_ids": list[int], "loss_mask": LongTensor[seq]}``. @@ -431,16 +296,24 @@ class EagleFetchPayload(TypedDict): class EagleVllmStreamingConfig(StreamingConfig): """Adds vLLM endpoint info on top of :class:`StreamingConfig`.""" - server_url: str + # One or more vLLM endpoints; fetches round-robin across them so a single fetcher + # can spread load over several server replicas. 
Accepts a list or a single + # (optionally comma-separated) string. + server_urls: list[str] model: str - # Allowlist for ``hidden_states_path`` returned by the server. Must match the - # connector's ``shared_storage_path``; out-of-tree paths are rejected. + # Allowlist for ``hidden_states_path`` returned by the server. Must match (or be a + # parent of) the connector's ``shared_storage_path``; out-of-tree paths are rejected. shared_storage_root: str - @field_validator("server_url") + @field_validator("server_urls", mode="before") @classmethod - def _strip_trailing_slash(cls, v: str) -> str: - return v.rstrip("/") + def _normalize_urls(cls, v): + if isinstance(v, str): + v = v.split(",") + urls = [u.strip().rstrip("/") for u in v if u and str(u).strip()] + if not urls: + raise ValueError("server_urls must contain at least one non-empty URL") + return urls @field_validator("shared_storage_root") @classmethod @@ -449,7 +322,7 @@ def _resolve_root(cls, v: str) -> str: class EagleVllmStreamingDataset(StreamingDataset): - """Eagle (algorithm) × vLLM (backend). + """Eagle (algorithm) x vLLM (backend). Talks to a ``vllm serve`` instance configured with the ``ExampleHiddenStatesConnector`` KV-transfer connector (the server dumps captured @@ -467,13 +340,38 @@ def __init__( tokenizer, config: EagleVllmStreamingConfig, ): - """Same as the base; ``config`` must include ``server_url`` and ``model``.""" + """Same as the base; ``config`` must include ``server_urls`` and ``model``.""" super().__init__(entries=entries, tokenizer=tokenizer, config=config) self.config: EagleVllmStreamingConfig = config - async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> EagleFetchPayload | None: - r = await client.post( - f"{self.config.server_url}/v1/completions", + def _client(self) -> httpx.Client: + """Lazily build a per-process HTTP client and round-robin cursor. 
+ + DataLoader workers are forked processes; httpx connection pools must not be + shared across a fork, so each process gets its own client (and its own + round-robin cursor over ``server_urls``), keyed by PID. + """ + pid = os.getpid() + if getattr(self, "_client_pid", None) != pid: + self._http = httpx.Client( + timeout=httpx.Timeout(self.config.request_timeout, connect=10.0) + ) + self._client_pid = pid + self._rr = 0 + return self._http + + def _next_url(self) -> str: + """Round-robin the next server URL (per-process cursor).""" + urls = self.config.server_urls + url = urls[self._rr % len(urls)] + self._rr += 1 + return url + + def _fetch(self, sample: dict) -> EagleFetchPayload | None: + client = self._client() + url = self._next_url() + r = client.post( + f"{url}/v1/completions", json={ "model": self.config.model, "prompt": sample["token_ids"], @@ -492,7 +390,7 @@ async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> EagleFetchPay f"[streaming] path outside shared_storage_root for {sample['cid']}: {path!r}" ) return None - token_ids, hidden_states = await asyncio.to_thread(self._load_safetensors, path) + token_ids, hidden_states = self._load_safetensors(path) # Contract: the server tokenization is the client's pre-tokenized prompt # verbatim, plus at most one decode-step token at the tail (from # ``max_tokens=1``). Anything else (e.g. server-side BOS prepend, chat @@ -573,36 +471,3 @@ def _format(self, fetched: EagleFetchPayload) -> dict[str, torch.Tensor]: "loss_mask": loss_mask, "labels": labels, } - - -class StreamingResumeCallback(TrainerCallback): - """Fast-forward :class:`StreamingDataset` past consumed samples on resume. - - Dispatcher pulls a *global* batch per micro-step, hence the ``world_size`` factor. - Requires ``training_args.ignore_data_skip=True``; round-trips only when - ``world_size`` and ``config.seed`` match the original run. 
- """ - - def on_train_begin(self, args, state, control, train_dataloader=None, **kwargs): - """Push the skip count into the dataset when resuming mid-training.""" - if state.global_step <= 0 or train_dataloader is None: - return - ds = train_dataloader.dataset - if not hasattr(ds, "set_resume_position"): - return - if not getattr(args, "ignore_data_skip", False): - raise RuntimeError( - "StreamingResumeCallback requires ignore_data_skip=True to avoid " - "double-skipping on resume." - ) - consumed = ( - state.global_step - * args.per_device_train_batch_size - * dist_utils.size() - * args.gradient_accumulation_steps - ) - ds.set_resume_position(consumed) - print_rank_0( - f"[StreamingResumeCallback] resuming at global_step={state.global_step}; " - f"skipping {consumed} entries" - ) diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py index a65a3183a05..a9670ec1efd 100644 --- a/modelopt/torch/speculative/plugins/hf_training_args.py +++ b/modelopt/torch/speculative/plugins/hf_training_args.py @@ -62,6 +62,9 @@ class DataArguments(BaseModel): sample_size: int = -1 streaming_server_url: str | None = None streaming_model_name: str | None = None + # Deprecated / no-op: the streaming dataset is map-style now, so fetch concurrency + # comes from the DataLoader's ``dataloader_num_workers``, not this knob. Kept so + # existing yamls that set ``data.streaming_prefetch`` still validate. streaming_prefetch: int = Field(default=64, ge=1) # Mirror of the vLLM connector's ``shared_storage_path``; trainer-side allowlist. 
streaming_shared_storage_path: str | None = None diff --git a/tests/examples/speculative_decoding/test_eagle_streaming.py b/tests/examples/speculative_decoding/test_eagle_streaming.py index 291aa0f7929..3c8f7573957 100644 --- a/tests/examples/speculative_decoding/test_eagle_streaming.py +++ b/tests/examples/speculative_decoding/test_eagle_streaming.py @@ -118,13 +118,12 @@ def test_streaming_eagle_training( f"data.streaming_server_url={server_url}", f"data.streaming_model_name={tiny_llama_path}", f"data.streaming_shared_storage_path={scratch}", - "data.streaming_prefetch=2", f"training.output_dir={output_dir}", "training.num_train_epochs=1", "training.learning_rate=1e-5", "training.training_seq_len=32", "training.save_steps=1", - "training.dataloader_num_workers=0", # enforced by StreamingDataset + "training.dataloader_num_workers=0", # map-style; 0 keeps this test single-process *_TINY_EAGLE_ARCH, ] diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py index 27210ee7286..4d094171967 100644 --- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py +++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py @@ -13,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for StreamingDataset's DDP contract. +"""Tests for the map-style StreamingDataset. -We do not spin up real torch.distributed; instead we monkeypatch the helper that -reads rank/world_size. Sharding itself is delegated to Accelerate's -``DataLoaderDispatcher`` (every rank holds the full corpus; only rank 0 iterates). -These tests check the corpus-handling and rank-0-only-iter properties on which -that delegation relies. +The dataset is a plain ``torch.utils.data.Dataset``: DDP sharding is HF Trainer's +job (``DistributedSampler``), so there is no rank/dispatch logic to test here. 
+These tests cover the ``__getitem__`` contract: resample-on-miss, the +consecutive-failure circuit breaker, and the vLLM wire-format -> batch-dict chain. """ from pathlib import Path @@ -30,7 +29,7 @@ import safetensors.torch import torch -# hf_streaming_dataset imports TrainerCallback / LabelSmoother at module scope. +# hf_streaming_dataset imports LabelSmoother at module scope. pytest.importorskip("transformers") from modelopt.torch.speculative.plugins import hf_streaming_dataset @@ -47,133 +46,100 @@ def _entries(n: int) -> list[dict]: return [{"id": i} for i in range(n)] -@pytest.fixture -def patch_dist(monkeypatch): - """Return a setter; tests call it with (rank, world) to simulate a DDP rank. - - Patches ``modelopt.torch.utils.distributed.rank/size`` as imported into the - streaming dataset module (``dist_utils``). The dataset reads these in - ``__init__`` for logging and in ``__iter__`` for the rank-0-only gate. - """ - - def _set(rank: int, world: int): - # ``is_master`` etc. call ``rank(group=...)`` / ``size(group=...)`` — match the signature. - monkeypatch.setattr(hf_streaming_dataset.dist_utils, "rank", lambda group=None: rank) - monkeypatch.setattr(hf_streaming_dataset.dist_utils, "size", lambda group=None: world) - - return _set - - -def _entry_ids(ds: StreamingDataset) -> list[int]: - return [e["id"] for e in ds.entries] - - -@pytest.mark.parametrize("world", [1, 2, 3, 8]) -def test_every_rank_holds_full_corpus(patch_dist, world): - """Each rank must see all entries — Accelerate's dispatcher does the sharding, - so any per-rank pre-shard here would shrink rank 0's view to 1/N and break - ``max_steps``. 
- """ - corpus = _entries(100) - for rank in range(world): - patch_dist(rank, world) - ds = StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=42)) - assert sorted(_entry_ids(ds)) == list(range(100)) - - -def test_same_seed_same_order(patch_dist): - """The shuffle is what makes rank 0's fetch order deterministic across reruns.""" - corpus = _entries(50) - patch_dist(0, 1) - a = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=7))) - b = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=7))) - assert a == b - - -def test_different_seed_different_order(patch_dist): - """Sanity: changing the seed actually reshuffles (else seed is vacuous).""" - corpus = _entries(50) - patch_dist(0, 1) - a = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=1))) - b = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=2))) - assert a != b - assert sorted(a) == sorted(b) - - -def test_non_rank_zero_iter_is_empty(patch_dist): - """Non-zero ranks must yield nothing on ``__iter__`` — their producer would burn - server requests that ``DataLoaderDispatcher`` would discard.""" - corpus = _entries(8) - patch_dist(2, 4) - ds = StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=0)) - assert list(iter(ds)) == [] - - -def test_iter_rejects_dataloader_workers(patch_dist, monkeypatch): - """Iterating from within a DataLoader worker must raise — multiple workers would - each spawn an asyncio loop and N× the request load on the server.""" - patch_dist(0, 1) - ds = StreamingDataset(_entries(4), tokenizer=MagicMock(), config=StreamingConfig(seed=0)) - # Pretend we're inside a DataLoader worker. 
- monkeypatch.setattr(hf_streaming_dataset, "get_worker_info", lambda: MagicMock()) - with pytest.raises(RuntimeError, match="dataloader_num_workers=0"): - next(iter(ds)) - - -def test_empty_corpus_raises(patch_dist): - patch_dist(0, 1) +def test_empty_corpus_raises(): with pytest.raises(ValueError, match="entries is empty"): StreamingDataset([], tokenizer=MagicMock(), config=StreamingConfig()) -def test_set_resume_position_skips_entries_without_fetching(patch_dist): - """Resume should fast-forward inside the dataset without invoking _fetch. +def test_len_matches_corpus(): + ds = StreamingDataset(_entries(37), tokenizer=MagicMock(), config=StreamingConfig()) + assert len(ds) == 37 - Verifies the contract relied on by StreamingResumeCallback: skipped entries - are not sent to the server, so resume costs nothing on the inference side. - """ - patch_dist(0, 1) - fetched_ids: list[int] = [] + +def test_getitem_resamples_past_unfit_entries(): + """An unfit entry (tokenize -> None) must not be returned; __getitem__ probes + forward to the next fetchable index and returns that instead.""" + fetched_cids: list[int] = [] class _Track(StreamingDataset): def _tokenize_entry(self, entry): + # Even ids are "unfit" (e.g. truncated away / missing fields). 
+ if entry["id"] % 2 == 0: + return None return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} - async def _fetch(self, client, sample): - fetched_ids.append(int(sample["cid"])) + def _fetch(self, sample): + fetched_cids.append(int(sample["cid"])) + return {"ok": True} - corpus = _entries(10) - ds = _Track(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=0, prefetch=2)) - ds.set_resume_position(5) - list(ds) + def _format(self, fetched): + return {"sentinel": fetched_cids[-1]} - expected = {e["id"] for e in ds.entries[5:]} - assert set(fetched_ids) == expected - # _resume_skip is one-shot - assert ds._resume_skip == 0 + ds = _Track(_entries(10), tokenizer=MagicMock(), config=StreamingConfig()) + # idx 0 is unfit -> resamples forward to idx 1. + out = ds[0] + assert out == {"sentinel": 1} + assert fetched_cids == [1] + # An already-fit index is returned directly. + assert ds[3] == {"sentinel": 3} -def test_circuit_breaker_trips_on_consecutive_fetch_failures(patch_dist): - """When _fetch keeps failing, the producer raises after the threshold so the - trainer sees a clear error instead of a silent empty epoch.""" - patch_dist(0, 1) +def test_circuit_breaker_trips_on_consecutive_failures(): + """When _fetch keeps failing, __getitem__ raises after the threshold instead of + silently resampling the whole corpus.""" threshold = 3 class _AlwaysFails(StreamingDataset): - # Bypass tokenization so we don't need a real tokenizer. 
def _tokenize_entry(self, entry): return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} - async def _fetch(self, client, sample): + def _fetch(self, sample): raise RuntimeError("simulated server failure") ds = _AlwaysFails( _entries(20), tokenizer=MagicMock(), - config=StreamingConfig(seed=0, prefetch=2, fail_after_consecutive_skips=threshold), + config=StreamingConfig(fail_after_consecutive_skips=threshold), ) with pytest.raises(RuntimeError, match="consecutive _fetch failures"): - list(ds) + ds[0] + + +def test_fetch_returning_none_exhausts_then_raises(): + """If every entry's fetch yields None (e.g. all rejected), __getitem__ raises a + clear 'no fetchable sample' error rather than hanging or returning junk.""" + + class _AllNone(StreamingDataset): + def _tokenize_entry(self, entry): + return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} + + def _fetch(self, sample): + return None + + ds = _AllNone( + _entries(4), + tokenizer=MagicMock(), + config=StreamingConfig(fail_after_consecutive_skips=100), + ) + with pytest.raises(RuntimeError, match="no fetchable sample"): + ds[0] + + +def test_server_urls_normalization(): + """server_urls accepts a single string, a comma-separated string, or a list, and + strips trailing slashes.""" + + def _urls(v): + cfg = EagleVllmStreamingConfig( + server_urls=v, model="m", shared_storage_root=str(Path.cwd()) + ) + return cfg.server_urls + + assert _urls("http://a:8000/") == ["http://a:8000"] + assert _urls("http://a:8000, http://b:8000/") == ["http://a:8000", "http://b:8000"] + assert _urls(["http://a:8000", "http://b:8000"]) == ["http://a:8000", "http://b:8000"] + with pytest.raises(ValueError, match="at least one non-empty URL"): + EagleVllmStreamingConfig(server_urls="", model="m", shared_storage_root=".") def _write_canned_safetensors(path: Path, seq: int, n_layers: int, hidden: int) -> None: @@ -196,14 +162,23 @@ def _tokenizer_returning(seq: int) -> MagicMock: return tok -def 
test_eagle_vllm_dataset_end_to_end(tmp_path, patch_dist, monkeypatch): +def _patch_sync_client(monkeypatch, handler): + """Route the dataset's per-process httpx.Client through a MockTransport handler.""" + real_client = httpx.Client + + def mock_client(*args, **kwargs): + kwargs["transport"] = httpx.MockTransport(handler) + return real_client(*args, **kwargs) + + monkeypatch.setattr(hf_streaming_dataset.httpx, "Client", mock_client) + + +def test_eagle_vllm_dataset_end_to_end(tmp_path, monkeypatch): """Drive EagleVllmStreamingDataset against an in-process mocked server. - Verifies that the wire-format → tensor → batch-dict chain produces dicts - matching what EagleOfflineDataCollator expects, and that scratch files - are cleaned up after each fetch. + Verifies the wire-format -> tensor -> batch-dict chain produces dicts matching + what EagleOfflineDataCollator expects, and that scratch files are cleaned up. """ - patch_dist(0, 1) seq, n_layers, hidden = 8, 3, 16 # n_layers = 1 final + 2 aux scratch = tmp_path / "vllm_scratch" scratch.mkdir() @@ -219,37 +194,25 @@ def handler(request: httpx.Request) -> httpx.Response: json={"kv_transfer_params": {"hidden_states_path": str(path)}}, ) - real_async_client = httpx.AsyncClient - - def mock_async_client(*args, **kwargs): - kwargs["transport"] = httpx.MockTransport(handler) - return real_async_client(*args, **kwargs) - - monkeypatch.setattr(hf_streaming_dataset.httpx, "AsyncClient", mock_async_client) + _patch_sync_client(monkeypatch, handler) n_entries = 4 entries = [ - { - "conversation_id": f"c-{i}", - "messages": [{"role": "user", "content": "x"}], - } + {"conversation_id": f"c-{i}", "messages": [{"role": "user", "content": "x"}]} for i in range(n_entries) ] ds = EagleVllmStreamingDataset( entries=entries, tokenizer=_tokenizer_returning(seq), config=EagleVllmStreamingConfig( - server_url="http://mock:8000", + server_urls="http://mock:8000", model="mock-model", shared_storage_root=str(scratch), - prefetch=2, - seed=0, 
), ) - batches = list(ds) + batches = [ds[i] for i in range(n_entries)] - assert len(batches) == n_entries expected_keys = { "input_ids", "base_model_hidden_states", @@ -275,9 +238,9 @@ def mock_async_client(*args, **kwargs): assert list(scratch.iterdir()) == [], "scratch files must be unlinked after fetch" -def test_path_outside_shared_storage_root_is_rejected(tmp_path, patch_dist, monkeypatch): - """Out-of-root path from server is not opened or unlinked.""" - patch_dist(0, 1) +def test_path_outside_shared_storage_root_is_rejected(tmp_path, monkeypatch): + """Out-of-root path from the server is not opened or unlinked; the fetch yields + None, so the single-entry corpus is exhausted and __getitem__ raises.""" seq, n_layers, hidden = 8, 3, 16 allowed = tmp_path / "allowed" allowed.mkdir() @@ -292,26 +255,19 @@ def handler(request: httpx.Request) -> httpx.Response: json={"kv_transfer_params": {"hidden_states_path": str(forbidden)}}, ) - real_async_client = httpx.AsyncClient - - def mock_async_client(*args, **kwargs): - kwargs["transport"] = httpx.MockTransport(handler) - return real_async_client(*args, **kwargs) - - monkeypatch.setattr(hf_streaming_dataset.httpx, "AsyncClient", mock_async_client) + _patch_sync_client(monkeypatch, handler) ds = EagleVllmStreamingDataset( entries=[{"conversation_id": "c-0", "messages": [{"role": "user", "content": "x"}]}], tokenizer=_tokenizer_returning(seq), config=EagleVllmStreamingConfig( - server_url="http://mock:8000", + server_urls="http://mock:8000", model="mock-model", shared_storage_root=str(allowed), fail_after_consecutive_skips=100, - prefetch=1, - seed=0, ), ) - assert list(ds) == [] + with pytest.raises(RuntimeError, match="no fetchable sample"): + ds[0] assert forbidden.exists(), "rejected path must not be unlinked" diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index 4a8dc8bbacf..a65cbc2bc41 100755 --- 
a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -224,8 +224,11 @@ wait_vllm_ready() { # Run the trainer then export the HF checkpoint. # $1 = streaming server base URL $2 = CUDA_VISIBLE_DEVICES ("" -> all) -# dataloader_num_workers must be 0: the streaming dataset owns one asyncio loop -# per process; multiple workers would duplicate requests against the server. +# The streaming dataset is map-style now, so fetch concurrency comes from the +# DataLoader's workers (each worker = one in-flight request). STREAMING_NUM_WORKERS +# sets that; keep it modest so (ranks-per-server x workers) stays near the server's +# max_num_seqs (flooding a cold NVFP4 MoE server kills EngineCore). 0 disables +# prefetch (serialized fetches) and is usually too slow. run_trainer_and_export() { local url="$1" cvd="$2" # Optional multi-node trainer routing (see dispatch section). Defaults keep @@ -247,7 +250,8 @@ run_trainer_and_export() { data.streaming_server_url="$url" \ data.streaming_model_name="$HF_MODEL_CKPT" \ data.streaming_shared_storage_path="$SERVE_SCRATCH" \ - training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; } + training.dataloader_num_workers="${STREAMING_NUM_WORKERS:-4}" \ + || { echo "ERROR: trainer failed." >&2; return 1; } # Export only on the head trainer (machine_rank 0); non-head trainer nodes # would race writing the same export dir. The export reads the saved diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml index fb92ba11234..a2b44c0ea15 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -1,13 +1,25 @@ # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on -# GB200/Blackwell (HSG). 
Sibling of hf_streaming_eagle3.yaml — same vLLM-serve + -# trainer split, same hardware reasoning — but trains a DFlash drafter instead of -# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the -# dflash recipe. +# GB200/Blackwell (HSG). Multi-node sibling of hf_streaming_dflash.yaml — same +# vLLM-serve + trainer split and same hardware reasoning, but scales the trainer +# across MULTIPLE nodes (1 serve node + N trainer nodes doing multi-node DDP) +# instead of the single trainer node in hf_streaming_dflash.yaml. # # Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native # NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, # ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4, -# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes. +# whole node), nodes 1..N = DFlash trainers (fake base), 4 GPUs each. This file +# allocates 3 nodes (1 serve + 2 trainers); bump slurm_config.nodes/segment to +# add more trainer nodes. +# +# Topology (see common/eagle3/train_eagle_streaming.sh header for the full +# dispatch): node 0 serves; the head trainer (Slurm node 1, accelerate +# machine_rank 0) publishes its IP via /scratchspace for accelerate's c10d +# rendezvous, and every trainer node reads both the serve address and the +# head-trainer address from /scratchspace. segment= pins all nodes into +# one NVL72 block so inter-node DDP traffic rides NVLink. NOTE: the dataset is +# map-style, so each trainer rank fetches its own DistributedSampler shard from +# the single serve; that one serve is still the throughput ceiling — extra +# trainer nodes scale effective batch / compute, not data-production throughput. 
# # How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired # up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py @@ -39,7 +51,7 @@ # SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ # SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ # NEMORUN_HOME=$PWD -# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \ +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes # # The export lands in /scratchspace/export. To benchmark it, point From cac937c339b0e40779168f22e7c127e2c3f043d3 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 06:29:53 +0000 Subject: [PATCH 03/14] multi node training Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/launch_train.sh | 16 +++++++-- .../plugins/hf_streaming_dataset.py | 34 +++++++++++++++---- .../plugins/test_hf_streaming_dataset.py | 24 +++++++++++++ 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh index fc623930767..3ff34b6ae9a 100755 --- a/examples/speculative_decoding/launch_train.sh +++ b/examples/speculative_decoding/launch_train.sh @@ -65,10 +65,22 @@ if [[ "$NUM_NODES" != "1" ]]; then # SLURM allocation reserves node 0 for something else, e.g. the streaming # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank). # Default to $SLURM_PROCID for the all-nodes-are-trainers case. - MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \ + # Canonical accelerate multi-node launch for a fixed Slurm allocation: + # --multi_gpu + static rendezvous via main_process_ip/port (-> MASTER_ADDR/PORT). 
+ # + # --multi_gpu is REQUIRED: with 1 GPU/node, each node's local process count is + # num_processes/num_machines = 1, and without --multi_gpu accelerate treats a + # single local process as non-distributed -- it never sets WORLD_SIZE/RANK or + # forms the process group, so every node trains the full dataset as its own + # world=1 (no hang, no real DDP). --multi_gpu forces DistributedType.MULTI_GPU + # so the nodes rendezvous into one world=$TOTAL_GPU group. + # + # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which + # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip. + MULTI_NODE_ARGS="--multi_gpu \ + --num_processes $TOTAL_GPU \ --num_machines $NUM_NODES \ --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \ - --rdzv_backend c10d \ --main_process_ip $HEAD_NODE_IP \ --main_process_port 29500" fi diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py index 6b050f00add..9c3655b76fe 100644 --- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py +++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py @@ -49,13 +49,14 @@ import contextlib import os +import time from pathlib import Path from typing import TypedDict import httpx import torch from pydantic import BaseModel, ConfigDict, Field, field_validator -from safetensors import safe_open +from safetensors import SafetensorError, safe_open from torch.utils.data import Dataset from transformers.trainer_pt_utils import LabelSmoother @@ -63,6 +64,13 @@ IGNORE_TOKEN_ID = LabelSmoother.ignore_index +# The vLLM connector writes the safetensors file asynchronously (writer thread pool) +# and returns its path before the write is durably visible, so an immediate read can +# race the writer. Retry the open with linear backoff until the file lands +# (worst case ~_READ_RETRIES * (_READ_RETRIES-1)/2 * _READ_BACKOFF s). 
+_READ_RETRIES = 10 +_READ_BACKOFF = 0.05 # seconds + def _tokenize_with_loss_mask( tokenizer, @@ -427,13 +435,25 @@ def _load_safetensors(path: str) -> tuple[torch.Tensor, torch.Tensor]: ``safe_open(..., framework="pt").get_tensor`` materializes an independent torch Tensor (not a view into the mmap'd file), so it is safe to unlink right after the ``with`` block exits. + + Retries past the writer race (see ``_READ_RETRIES``): a missing file means + the write hasn't started; a ``SafetensorError`` means it's mid-write. Both + clear once the writer finishes, so back off and retry before giving up. """ - with safe_open(path, framework="pt") as f: - token_ids = f.get_tensor("token_ids") - hidden_states = f.get_tensor("hidden_states") # [seq, n_layers, hidden] - with contextlib.suppress(OSError): - os.unlink(path) - return token_ids, hidden_states + for attempt in range(_READ_RETRIES): + try: + with safe_open(path, framework="pt") as f: + token_ids = f.get_tensor("token_ids") + hidden_states = f.get_tensor("hidden_states") # [seq, n_layers, hidden] + with contextlib.suppress(OSError): + os.unlink(path) + return token_ids, hidden_states + except (FileNotFoundError, SafetensorError): # noqa: PERF203 -- retry-on-race loop + if attempt == _READ_RETRIES - 1: + raise + time.sleep(_READ_BACKOFF * (attempt + 1)) + # Unreachable (the last attempt above re-raises); guards _READ_RETRIES < 1. 
+ raise RuntimeError(f"_load_safetensors exhausted {_READ_RETRIES} retries for {path}") @staticmethod def _align_loss_mask(loss_mask: torch.Tensor, n: int) -> torch.Tensor: diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py index 4d094171967..a0a37bc7afa 100644 --- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py +++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py @@ -271,3 +271,27 @@ def handler(request: httpx.Request) -> httpx.Response: with pytest.raises(RuntimeError, match="no fetchable sample"): ds[0] assert forbidden.exists(), "rejected path must not be unlinked" + + +def test_load_safetensors_retries_past_writer_race(tmp_path, monkeypatch): + """The connector writes asynchronously, so an immediate read can race it; + _load_safetensors must retry past the transient FileNotFound/Safetensor error.""" + seq, n_layers, hidden = 4, 2, 8 + path = tmp_path / "late.safetensors" + _write_canned_safetensors(path, seq, n_layers, hidden) + + calls = {"n": 0} + real_safe_open = hf_streaming_dataset.safe_open + + def flaky_safe_open(p, framework): + calls["n"] += 1 + if calls["n"] < 3: # first 2 reads race the writer (file not ready yet) + raise FileNotFoundError(f"No such file or directory: {p}") + return real_safe_open(p, framework=framework) + + monkeypatch.setattr(hf_streaming_dataset, "safe_open", flaky_safe_open) + monkeypatch.setattr(hf_streaming_dataset.time, "sleep", lambda *_: None) # no real backoff + + token_ids, hidden_states = EagleVllmStreamingDataset._load_safetensors(str(path)) + assert calls["n"] == 3 + assert hidden_states.shape == (seq, n_layers, hidden) From 19551b621c1fbff3ee0f9182d8206a71e6f0b129 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:23:00 +0000 Subject: [PATCH 04/14] multinode serving Signed-off-by: h-guo18 
<67671475+h-guo18@users.noreply.github.com> --- .../common/eagle3/train_eagle_streaming.sh | 145 ++++++++++-------- 1 file changed, 81 insertions(+), 64 deletions(-) diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index a65cbc2bc41..6f2875e525a 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -20,23 +20,24 @@ # dumping to disk. Sibling of train_eagle.sh. # # Topology is chosen automatically from the Slurm allocation (the launcher yaml's -# `nodes:` field); nemo_run runs this script once per node, so it branches on -# $SLURM_NODEID: -# nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of -# the local GPUs (original single-node behavior). -# nodes == 2 -> split: node 0 runs vllm serve on all its GPUs, node 1 runs -# the trainer on all its GPUs. Roles rendezvous through the -# shared /scratchspace mount (node 0 publishes its serve -# address; the trainer signals completion). -# nodes >= 3 -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1) -# doing multi-node DDP. The head trainer (node 1, accelerate -# machine_rank 0) publishes its IP for accelerate's c10d -# rendezvous; all trainer nodes read both the serve address and -# the head-trainer address from /scratchspace. NOTE: only global -# rank 0 fetches hidden states from the single serve and -# broadcasts to the rest (DataLoaderDispatcher), so the single -# serve is the throughput ceiling — adding trainer nodes scales -# effective batch / compute, not data-production throughput. +# `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it +# branches on $SLURM_NODEID: +# nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest +# of the local GPUs (original single-node behavior). 
+# nodes >= 2 -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent +# vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1 +# are trainers doing multi-node DDP. SERVE_NODES defaults to 1 +# (1 serve + N trainers). Rendezvous over the shared +# /scratchspace mount: each serve i publishes its address to +# .serve_addr.i; the head trainer (first trainer node, +# accelerate machine_rank 0) publishes its IP for accelerate's +# rendezvous; trainers collect every serve address. +# +# The streaming dataset is map-style: HF Trainer's DistributedSampler shards the +# corpus across all trainer ranks and each rank fetches ONLY its own shard, +# round-robin across the SERVE_NODES replicas (data.streaming_server_url is the +# comma-joined list). So trainer nodes scale effective batch / compute and +# distribute the reads; serve nodes scale data-production throughput (~K x). # # Env vars (required): # HF_MODEL_CKPT Target model path. Used by both vllm serve (as the @@ -48,6 +49,8 @@ # default = [1,17,32] -> capture = [2,18,33,36]. # # Env vars (optional): +# SERVE_NODES multi-node only: number of dedicated serve replica nodes +# (Slurm nodes 0..SERVE_NODES-1). default 1. # SERVE_PORT default 8765 # SERVE_GPU_MEM_UTIL default 0.4 (single-node) / 0.9 (multi-node serve node) # SERVE_READY_TIMEOUT seconds to wait for the server to come up. default 900 @@ -135,10 +138,15 @@ SCRIPT_ARGS=("$@") SERVE_PORT="${SERVE_PORT:-8765}" SERVE_READY_TIMEOUT="${SERVE_READY_TIMEOUT:-900}" +# Number of dedicated serve replica nodes (multi-node only). Default 1. +SERVE_NODES="${SERVE_NODES:-1}" +# All serve replicas share one scratch dir; per-request safetensors files are keyed +# by a unique vllm request id, so they don't collide across servers. 
SERVE_SCRATCH="/scratchspace/streaming_serve_scratch" -SERVE_LOG="/scratchspace/vllm_serve.log" -# Multi-node rendezvous over the shared /scratchspace mount (lustre, visible on -# every node): node 0 publishes its address here, node 1 signals completion here. +SERVE_LOG="/scratchspace/vllm_serve.log" # serve nodes override with a per-node path +# Rendezvous over the shared /scratchspace mount (lustre, visible on every node): +# each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer +# signals completion via DONE_FILE; trainers collect all serve addresses. SERVE_ADDR_FILE="/scratchspace/.serve_addr" DONE_FILE="/scratchspace/.training_done" SERVE_PID="" @@ -153,6 +161,18 @@ cleanup() { gpus_on_node() { nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n1; } +# Resolve a *routable* IP for this node (other nodes must be able to dial it). +# `hostname -I` can list a link-local (169.254.x) or loopback address first, so +# prefer the resolved Slurm node name, then the first non-loopback/non-link-local IP. +# $1 = optional override (e.g. SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP) +resolve_routable_ip() { + local ip="$1" + [ -z "$ip" ] && ip=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) + [ -z "$ip" ] && ip=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1) + [ -z "$ip" ] && ip=$(hostname -I | awk '{print $1}') + echo "$ip" +} + # Start vllm serve in the background. Sets SERVE_PID. 
# $1 = bind host $2 = tensor-parallel size $3 = CUDA_VISIBLE_DEVICES ("" -> all) launch_vllm() { @@ -306,71 +326,68 @@ PY wait_vllm_ready "http://${SERVE_HOST}:${SERVE_PORT}" || exit 1 run_trainer_and_export "http://${SERVE_HOST}:${SERVE_PORT}" "$TRAIN_GPUS" || exit 1 -elif [ "$NODEID" -eq 0 ]; then - # ----------------------- multi-node: serve node ------------------------ - SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}" # dedicated node -> use most of it - SERVE_TP="${SERVE_TP:-$(gpus_on_node)}" # default: all GPUs on this node - rm -f "$SERVE_ADDR_FILE" "$DONE_FILE" # clear stale rendezvous state +elif [ "$NODEID" -lt "$SERVE_NODES" ]; then + # ---------------------- multi-node: serve node(s) ---------------------- + # Slurm nodes 0..SERVE_NODES-1 each run an independent vllm serve replica on + # their whole node and publish their address to ${SERVE_ADDR_FILE}.${NODEID}. + SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}" # dedicated node -> use most of it + SERVE_TP="${SERVE_TP:-$(gpus_on_node)}" # default: all GPUs on this node + SERVE_LOG="/scratchspace/vllm_serve.${NODEID}.log" # per-node log (avoid collision) + rm -f "${SERVE_ADDR_FILE}.${NODEID}" # clear own stale address + [ "$NODEID" -eq 0 ] && rm -f "$DONE_FILE" # node 0 clears the shared sentinel once trap cleanup INT TERM EXIT launch_vllm "0.0.0.0" "$SERVE_TP" "" wait_vllm_ready "http://127.0.0.1:${SERVE_PORT}" || exit 1 - # Publish a *routable* address for the trainer node. `hostname -I` can list a - # link-local (169.254.x) or loopback address first, which is unreachable from - # the other node, so resolve the Slurm node name and fall back to the first - # non-link-local / non-loopback IP. 
- serve_addr="${SERVE_ADVERTISE_IP:-}" - if [ -z "$serve_addr" ]; then - serve_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) - fi - if [ -z "$serve_addr" ]; then - serve_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1) - fi - [ -z "$serve_addr" ] && serve_addr=$(hostname -I | awk '{print $1}') - echo "$serve_addr" > "$SERVE_ADDR_FILE" - echo "Serve node published ${serve_addr}; holding the server up until the trainer signals done..." + serve_addr=$(resolve_routable_ip "${SERVE_ADVERTISE_IP:-}") + echo "$serve_addr" > "${SERVE_ADDR_FILE}.${NODEID}" + echo "Serve node ${NODEID}/${SERVE_NODES} published ${serve_addr}; holding up until training signals done..." while [ ! -f "$DONE_FILE" ]; do sleep 10; done - echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)." + echo "Training-done sentinel seen; serve node ${NODEID} exiting (EXIT trap stops vllm)." -elif [ "$NODEID" -ge 1 ]; then +else # -------------------- multi-node: trainer node(s) ---------------------- - # Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which - # map to 0-based accelerate machine ranks (head trainer = SLURM node 1). - NUM_TRAINER_NODES=$(( NNODES - 1 )) - TRAINER_RANK=$(( NODEID - 1 )) + # Serve nodes are 0..SERVE_NODES-1; trainer nodes are SERVE_NODES..NNODES-1, + # mapping to 0-based accelerate machine ranks (head trainer = first trainer node). + NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES )) + TRAINER_RANK=$(( NODEID - SERVE_NODES )) TRAINER_ADDR_FILE="/scratchspace/.trainer_addr" - # Only the head trainer (rank 0) signals the serve node to release on exit; - # a non-head node exiting first must NOT tear the serve down early. + # Only the head trainer (rank 0) signals the serve nodes to release on exit; + # a non-head node exiting first must NOT tear the serves down early. 
if [ "$TRAINER_RANK" -eq 0 ]; then trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT rm -f "$TRAINER_ADDR_FILE" # clear stale rendezvous state fi - echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..." - for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do - [ -f "$SERVE_ADDR_FILE" ] && break - sleep 1 + # Collect every serve replica's address and build the comma-joined URL list the + # streaming dataset round-robins across (one fetch per worker, spread over serves). + echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for ${SERVE_NODES} serve address(es)..." + URLS="" + for ((s = 0; s < SERVE_NODES; s++)); do + af="${SERVE_ADDR_FILE}.${s}" + for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do + [ -f "$af" ] && break + sleep 1 + done + [ -f "$af" ] || { echo "ERROR: serve node ${s} never published its address." >&2; exit 1; } + surl="http://$(cat "$af"):${SERVE_PORT}" + wait_vllm_ready "$surl" || exit 1 + URLS="${URLS:+$URLS,}$surl" done - [ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; } - URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}" - wait_vllm_ready "$URL" || exit 1 + echo "Trainer rank ${TRAINER_RANK} using serve URLs: ${URLS}" if [ "$NUM_TRAINER_NODES" -le 1 ]; then - # Original 1-serve + 1-trainer topology: single-node DDP, unchanged. - run_trainer_and_export "$URL" "" || exit 1 + # 1 trainer node: single-node DDP (no accelerate multi-node routing). + run_trainer_and_export "$URLS" "" || exit 1 else # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's - # c10d rendezvous (port 29500); all trainer nodes read it and join. Reuse - # the serve node's IP-resolution logic (avoid link-local / loopback). + # rendezvous (port 29500); all trainer nodes read it and join. 
if [ "$TRAINER_RANK" -eq 0 ]; then - head_addr="${TRAINER_ADVERTISE_IP:-}" - [ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) - [ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1) - [ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}') + head_addr=$(resolve_routable_ip "${TRAINER_ADVERTISE_IP:-}") echo "$head_addr" > "$TRAINER_ADDR_FILE" - echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous." + echo "Head trainer (rank 0) published ${head_addr} for accelerate rendezvous." else echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..." for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do @@ -380,7 +397,7 @@ elif [ "$NODEID" -ge 1 ]; then [ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; } fi HEAD_IP=$(cat "$TRAINER_ADDR_FILE") - run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1 + run_trainer_and_export "$URLS" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1 fi fi From c524abf1d33a5c6415fb571d68852daa30759563 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:24:09 +0000 Subject: [PATCH 05/14] qwen multinode streaming example Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- .../hf_streaming_eagle3_multi_node.yaml | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml new file mode 100644 index 00000000000..aac5b71ecdf --- /dev/null +++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml @@ -0,0 +1,112 @@ +# EAGLE3 streaming 
speculative decoding pipeline for Qwen3-8B — MULTI-NODE. +# +# Multi-node sibling of hf_streaming_eagle3.yaml. Both the serve and trainer sides +# scale across multiple nodes, with multiple GPUs per node. task_1 allocates 4 +# nodes x 2 GPUs: SERVE_NODES (=2) run independent vllm serve replicas (TP=2 each), +# the remaining 2 run multi-node-DDP trainers (2 GPUs each -> world_size=4). Tune +# the split via slurm_config.nodes/gpus_per_node + the SERVE_NODES env: +# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. +# +# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): +# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes +# K..N-1 are trainers. The head trainer publishes its IP for accelerate's +# rendezvous; every trainer reads all serve addresses and joins the DDP group. +# +# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler +# shards the corpus across ALL trainer ranks and each rank fetches ONLY its own +# shard, round-robin across the K serve replicas (data.streaming_server_url is the +# comma-joined list). Trainer nodes scale effective batch / compute and distribute +# the lustre reads; serve nodes scale data-production throughput (~K x), lifting +# the single-serve ceiling. 
+# +# 3-step pipeline: +# task_0: Build input conversations (jsonl) +# task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU) +# task_2: Benchmark — evaluate speculative decoding speedup via VLLM +# +# Usage: +# uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml --yes + +job_name: Qwen3-8B_EAGLE3_streaming_multi_node +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/Qwen/Qwen3-8B + + # Step 1: Build input conversations + task_0: + script: common/eagle3/make_dataset.sh + args: + - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml + - --full-conversations + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 1 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each). + # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); + # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. + # + # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32]; + # vllm capture ids are those shifted by +1, plus the final layer: + # [2, 18, 33] + [36] = [2, 18, 33, 36]. + task_1: + script: common/eagle3/train_eagle_streaming.sh + args: + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml + - model.model_name_or_path=<<global_vars.hf_model>> + - data.mode=streaming + - data.data_path=/scratchspace/data/train.jsonl + - training.output_dir=/scratchspace/eagle3 + - training.training_seq_len=4096 + - training.disable_tqdm=true + - training.ar_validate_steps=500000 + - training.num_train_epochs=1 + - eagle.eagle_use_torch_compile=false + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + # No spaces: nemo_run emits `export FOO=value` without quotes, so a + # space-separated value would be split by the shell. 
+ - EAGLE_CAPTURE_IDS: "[2,18,33,36]" + # Each serve node has 2 GPUs -> TP=2. + - SERVE_TP: "2" + # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. + - SERVE_NODES: "2" + # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES. + # Here 4 ranks (2 nodes x 2 GPU) x 4 / 2 serves = 8 concurrent per serve — fine + # for Qwen's max_num_seqs. + - STREAMING_NUM_WORKERS: "4" + slurm_config: + _factory_: "slurm_factory" + nodes: 4 + ntasks_per_node: 1 + gpus_per_node: 2 + container: vllm/vllm-openai:latest + + # Step 3: Benchmark speculative decoding (VLLM backend) + task_2: + script: common/specdec_bench/quick_check.sh + args: + - --draft_model_dir /scratchspace/export + - --draft_length 3 + - --output_length 4096 + - --engine VLLM + - --tp_size 1 + - --ep_size 1 + - --speculative_algorithm EAGLE3 + - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl + - --concurrency 1 + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 1 + container: vllm/vllm-openai:latest From 8eb3525a7f1b281e4aa742bbedb1bb33faa20221 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:23:38 +0000 Subject: [PATCH 06/14] add kimi example Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- .../Kimi-K2.5/hf_streaming_dflash.yaml | 15 ++-- .../hf_streaming_dflash_multi_node.yaml | 68 +++++++++++-------- .../Kimi-K2.5/hf_streaming_eagle3.yaml | 15 ++-- 3 files changed, 63 insertions(+), 35 deletions(-) diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml index ff99ae62c7f..d16ca3822c7 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml @@ -81,10 +81,6 @@ pipeline: - 
model.trust_remote_code=true - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4 - # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker - # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. - - data.streaming_prefetch=8 - training.output_dir=/scratchspace/dflash # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. - training.training_seq_len=4096 @@ -106,6 +102,12 @@ pipeline: # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" - SERVE_TP: "4" + # DataLoader workers per trainer rank = in-flight requests per rank. The + # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0), + # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS. + # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE + # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort. + - STREAMING_NUM_WORKERS: "1" # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's @@ -126,6 +128,11 @@ pipeline: slurm_config: _factory_: "slurm_factory" nodes: 2 + # Pin the serve node + trainer node into one NVL72 block. Inter-node here is + # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this + # is a latency/locality nicety rather than a correctness requirement (cf. the + # multi_node examples, where cross-node trainer DDP makes segment essential). 
+ segment: 2 ntasks_per_node: 1 gpus_per_node: 4 container: vllm/vllm-openai:latest diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml index a2b44c0ea15..b645a0e428c 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -1,25 +1,35 @@ # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on -# GB200/Blackwell (HSG). Multi-node sibling of hf_streaming_dflash.yaml — same -# vLLM-serve + trainer split and same hardware reasoning, but scales the trainer -# across MULTIPLE nodes (1 serve node + N trainer nodes doing multi-node DDP) -# instead of the single trainer node in hf_streaming_dflash.yaml. +# GB200/Blackwell (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of +# hf_streaming_dflash.yaml: BOTH sides scale out. SERVE_NODES (=2) run independent +# vllm serve replicas (TP=4 each, whole node); the remaining nodes run multi-node-DDP +# trainers (4 GPUs each). This file allocates 4 nodes = 2 serve + 2 trainer +# (world_size=8). Tune the split via slurm_config.nodes/segment + SERVE_NODES: +# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. # # Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native # NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, -# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4, -# whole node), nodes 1..N = DFlash trainers (fake base), 4 GPUs each. This file -# allocates 3 nodes (1 serve + 2 trainers); bump slurm_config.nodes/segment to -# add more trainer nodes. +# ~138 GB/GPU of weights) with NO cpu-offload. So each serve replica owns a whole +# node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base). 
# +# Topology (see common/eagle3/train_eagle_streaming.sh header for the full -# dispatch): node 0 serves; the head trainer (Slurm node 1, accelerate -# machine_rank 0) publishes its IP via /scratchspace for accelerate's c10d -# rendezvous, and every trainer node reads both the serve address and the -# head-trainer address from /scratchspace. segment=<N> pins all nodes into -# one NVL72 block so inter-node DDP traffic rides NVLink. NOTE: only global rank -# 0 fetches hidden states from the single serve and broadcasts them to the rest -# (DataLoaderDispatcher), so the single serve is the throughput ceiling — extra -# trainer nodes scale effective batch / compute, not data-production throughput. +# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): +# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes +# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d +# rendezvous; every trainer reads all K serve addresses and joins the DDP group. +# segment=<N> pins all nodes into one NVL72 block so inter-node DDP traffic +# rides NVLink. +# +# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF +# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each +# rank fetches ONLY its own shard, round-robin across the K serve replicas +# (data.streaming_server_url is the comma-joined list the script assembles). So +# trainer nodes scale effective batch / compute and distribute the lustre reads; +# serve nodes scale data-production throughput (~K x), lifting the single-serve +# ceiling that bounded the old single-serve multi-node path. +# +# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below): +# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES +# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server +# stalls a worker past vLLM's execute-model timeout and kills EngineCore. 
# # How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired # up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py @@ -81,9 +91,9 @@ pipeline: gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 - # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node. - # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh): - # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ. + # Step 2: Streaming DFlash training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each). + # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); + # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -93,16 +103,12 @@ pipeline: - model.trust_remote_code=true - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4 - # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker - # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. - - data.streaming_prefetch=8 - training.output_dir=/scratchspace/dflash # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. - training.training_seq_len=4096 - training.disable_tqdm=true - - training.num_train_epochs=1 - training.ar_validate_steps=500000 + - training.num_train_epochs=1 - training.max_steps=500 # See header: Kimi's template lacks {% generation %} tags; train on all tokens. - training.answer_only_loss=false @@ -118,7 +124,15 @@ pipeline: # No spaces in values: nemo_run emits `export FOO=value` unquoted. # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" + # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. 
+ - SERVE_NODES: "2" + # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi). - SERVE_TP: "4" + # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1: + # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS. + # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker + # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. + - STREAMING_NUM_WORKERS: "1" # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's @@ -138,8 +152,8 @@ pipeline: - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: _factory_: "slurm_factory" - nodes: 3 - segment: 3 + nodes: 4 + segment: 4 ntasks_per_node: 1 gpus_per_node: 4 container: vllm/vllm-openai:latest diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml index 24487ab8621..9ed18150869 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml @@ -55,10 +55,6 @@ pipeline: - model.trust_remote_code=true - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4 - # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker - # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. - - data.streaming_prefetch=8 - training.output_dir=/scratchspace/eagle3 - training.training_seq_len=4096 - training.disable_tqdm=true @@ -71,6 +67,12 @@ pipeline: # No spaces in values: nemo_run emits `export FOO=value` unquoted. - EAGLE_CAPTURE_IDS: "[2,30,58,60]" - SERVE_TP: "4" + # DataLoader workers per trainer rank = in-flight requests per rank. 
The + # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0), + # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS. + # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE + # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort. + - STREAMING_NUM_WORKERS: "1" # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with # 183/184 GB used). Cap context to the training seq len and leave headroom @@ -90,6 +92,11 @@ pipeline: slurm_config: _factory_: "slurm_factory" nodes: 2 + # Pin the serve node + trainer node into one NVL72 block. Inter-node here is + # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this + # is a latency/locality nicety rather than a correctness requirement (cf. the + # multi_node examples, where cross-node trainer DDP makes segment essential). + segment: 2 ntasks_per_node: 1 gpus_per_node: 4 container: vllm/vllm-openai:latest From 02656249872cf7d5f42431dda0916caec2ea9e24 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 3 Jun 2026 10:24:52 +0000 Subject: [PATCH 07/14] add k25 eagle3 multinode straming Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- .../hf_streaming_eagle3_multi_node.yaml | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml new file mode 100644 index 00000000000..e0c32debb0e --- /dev/null +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml @@ -0,0 +1,151 @@ +# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell +# (HSG) — 
MULTI-NODE (multi-serve). Multi-node sibling of hf_streaming_eagle3.yaml: +# BOTH sides scale out. SERVE_NODES (=2) run independent vllm serve replicas (TP=4 +# each, whole node); the remaining nodes run multi-node-DDP trainers (4 GPUs each). +# This file allocates 4 nodes = 2 serve + 2 trainer (world_size=8). Tune the split +# via slurm_config.nodes/segment + SERVE_NODES: +# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. +# +# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native +# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, +# ~138 GB/GPU of weights) with NO cpu-offload. On CW H100 the model needed +# cpu-offload (-> ~1 tok/s -> vLLM EngineCore TimeoutError), so GB200 is the +# working path. Each serve replica owns a whole node at TP=4; each trainer node +# uses all 4 GPUs for the draft (fake base). +# +# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): +# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes +# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d +# rendezvous; every trainer reads all K serve addresses and joins the DDP group. +# segment=<N> pins all nodes into one NVL72 block so inter-node DDP traffic +# rides NVLink. +# +# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF +# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each +# rank fetches ONLY its own shard, round-robin across the K serve replicas +# (data.streaming_server_url is the comma-joined list the script assembles). So +# trainer nodes scale effective batch / compute and distribute the lustre reads; +# serve nodes scale data-production throughput (~K x), lifting the single-serve +# ceiling. 
+# +# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below): +# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES +# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server +# stalls a worker past vLLM's execute-model timeout and kills EngineCore. +# +# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed +# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60. +# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base. +# +# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# SLURM_PARTITION=batch \ +# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ +# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# NEMORUN_HOME=$PWD +# uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml \ +# identity=$HOME/.ssh/id_ecdsa detach=True --yes + +job_name: Kimi-K2.5-NVFP4_EAGLE3_streaming_multi_node +pipeline: + allow_to_fail: false + skip: false + note: + + global_vars: + hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 + + # Step 1: Build input conversations (model-agnostic) + task_0: + script: common/eagle3/make_dataset.sh + args: + - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml + - --full-conversations + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), + # so request 4 even though make_dataset is CPU-only. + gpus_per_node: 4 + container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 + + # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each). 
+ # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); + # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. + task_1: + script: common/eagle3/train_eagle_streaming.sh + args: + - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml + - model.model_name_or_path=<<global_vars.hf_model>> + - model.use_fake_base_for_offline=true + - model.trust_remote_code=true + - data.mode=streaming + - data.data_path=/scratchspace/data/train.jsonl + - training.output_dir=/scratchspace/eagle3 + - training.training_seq_len=4096 + - training.disable_tqdm=true + - training.ar_validate_steps=500000 + - training.num_train_epochs=1 + - training.max_steps=500 + - eagle.eagle_use_torch_compile=false + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + # No spaces in values: nemo_run emits `export FOO=value` unquoted. + - EAGLE_CAPTURE_IDS: "[2,30,58,60]" + # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. + - SERVE_NODES: "2" + # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi). + - SERVE_TP: "4" + # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1: + # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS. + # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker + # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. + - STREAMING_NUM_WORKERS: "1" + # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's + # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with + # 183/184 GB used). Cap context to the training seq len and leave headroom + # for activation spikes during the profiling forward. + - SERVE_MAX_MODEL_LEN: "4096" + # Small batches: smaller per-step MoE compute stays under the engine timeout. 
+ - SERVE_MAX_NUM_SEQS: "4" + - SERVE_GPU_MEM_UTIL: "0.8" + - SERVE_READY_TIMEOUT: "2400" + - SERVE_EXTRA_ARGS: "--trust-remote-code" + # The killer was "RPC call to sample_tokens timed out" — a worker stalls on + # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default + # execute-model timeout, so EngineCore dies. Extend the timeouts that govern + # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. + - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" + - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" + slurm_config: + _factory_: "slurm_factory" + nodes: 4 + segment: 4 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest + + # Step 3: Benchmark speculative decoding (VLLM backend, Kimi served at TP=4) + task_2: + script: common/specdec_bench/quick_check.sh + args: + - --draft_model_dir /scratchspace/export + - --draft_length 3 + - --output_length 4096 + - --engine VLLM + - --tp_size 4 + - --ep_size 1 + - --speculative_algorithm EAGLE3 + - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl + - --concurrency 32 + # Kimi has custom modeling code; bench run.py loads base+tokenizer and needs this. 
+ - --trust_remote_code + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 4 + container: vllm/vllm-openai:latest From fc393663cd39475ac6114e20ad813191936d74bb Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:57:20 +0000 Subject: [PATCH 08/14] address comment Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/launch_train.sh | 26 +++++++++++++------ .../common/eagle3/train_eagle_streaming.sh | 19 +++++++++++--- tools/launcher/core.py | 13 +++++++--- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh index 3ff34b6ae9a..cd890c72053 100755 --- a/examples/speculative_decoding/launch_train.sh +++ b/examples/speculative_decoding/launch_train.sh @@ -59,7 +59,7 @@ else fi # Multi-node routing args (accelerate only; training config comes from the YAML) -MULTI_NODE_ARGS="" +MULTI_NODE_ARGS=() if [[ "$NUM_NODES" != "1" ]]; then # machine_rank: caller may pass --machine_rank explicitly (needed when the # SLURM allocation reserves node 0 for something else, e.g. the streaming @@ -77,17 +77,27 @@ if [[ "$NUM_NODES" != "1" ]]; then # # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip. 
- MULTI_NODE_ARGS="--multi_gpu \ - --num_processes $TOTAL_GPU \ - --num_machines $NUM_NODES \ - --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \ - --main_process_ip $HEAD_NODE_IP \ - --main_process_port 29500" + MULTI_NODE_ARGS=( + --multi_gpu + --num_processes "$TOTAL_GPU" + --num_machines "$NUM_NODES" + --machine_rank "${MACHINE_RANK:-$SLURM_PROCID}" + --main_process_ip "$HEAD_NODE_IP" + --main_process_port 29500 + ) fi export TOKENIZERS_PARALLELISM=False +# Build the argv directly (no `sh -c`): a re-parsed command string would word-split +# overrides that contain spaces (e.g. training.output_dir=/tmp/has space) and would +# execute command substitutions embedded in override values. An array preserves each +# argument boundary verbatim. +CMD=(accelerate launch --mixed_precision bf16 + "${MULTI_NODE_ARGS[@]}" + "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}") + set -x start_time=$(date +%s) -sh -c "accelerate launch --mixed_precision bf16 $MULTI_NODE_ARGS ${SCRIPT_DIR}/main.py --config $CONFIG_FILE ${EXTRA_ARGS[*]}" +"${CMD[@]}" echo "Total time: $(( $(date +%s) - $start_time )) seconds" diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index 6f2875e525a..b3637b1621d 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -147,8 +147,13 @@ SERVE_LOG="/scratchspace/vllm_serve.log" # serve nodes override with a per-nod # Rendezvous over the shared /scratchspace mount (lustre, visible on every node): # each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer # signals completion via DONE_FILE; trainers collect all serve addresses. -SERVE_ADDR_FILE="/scratchspace/.serve_addr" -DONE_FILE="/scratchspace/.training_done" +# Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on +# the same shared mount don't read/write each other's addresses. 
SLURM_JOB_ID is +# identical across every node of one allocation (so the namespacing is consistent) +# and unique across allocations; falls back to a fixed token off-Slurm (single run). +RUN_ID="${SLURM_JOB_ID:-local}" +SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}" +DONE_FILE="/scratchspace/.training_done.${RUN_ID}" SERVE_PID="" mkdir -p "$SERVE_SCRATCH" @@ -302,6 +307,14 @@ run_trainer_and_export() { NNODES="${SLURM_NNODES:-1}" NODEID="${SLURM_NODEID:-0}" +# Multi-node needs at least one trainer node: with SERVE_NODES >= NNODES every node +# takes the serve branch, so no trainer ever publishes the rendezvous address or the +# DONE_FILE and the serve nodes block forever. Reject it up front. +if [ "$NNODES" -gt 1 ] && [ "$SERVE_NODES" -ge "$NNODES" ]; then + echo "ERROR: SERVE_NODES ($SERVE_NODES) must be < SLURM_NNODES ($NNODES); need >=1 trainer node." >&2 + exit 1 +fi + if [ "$NNODES" -le 1 ]; then # ----------------------------- single node ----------------------------- SERVE_HOST="${SERVE_HOST:-127.0.0.1}" @@ -352,7 +365,7 @@ else # mapping to 0-based accelerate machine ranks (head trainer = first trainer node). NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES )) TRAINER_RANK=$(( NODEID - SERVE_NODES )) - TRAINER_ADDR_FILE="/scratchspace/.trainer_addr" + TRAINER_ADDR_FILE="/scratchspace/.trainer_addr.${RUN_ID}" # per-job (see RUN_ID) # Only the head trainer (rank 0) signals the serve nodes to release on exit; # a non-head node exiting first must NOT tear the serves down early. diff --git a/tools/launcher/core.py b/tools/launcher/core.py index f6ae6493af3..0639d2afac7 100644 --- a/tools/launcher/core.py +++ b/tools/launcher/core.py @@ -270,6 +270,15 @@ def build_slurm_executor( identity=identity, ) + # --segment=: pin all nodes into one topology block (one NVL72 / NVLink domain). + # getattr (not attribute access) keeps older/custom SlurmConfig types patched in via + # set_slurm_config_type that predate the `segment` field from raising AttributeError. 
+ # None -> omit the kwarg entirely so the scheduler places freely (default behavior). + optional_kwargs = {} + segment = getattr(slurm_config, "segment", None) + if segment is not None: + optional_kwargs["segment"] = segment + executor = run.SlurmExecutor( account=slurm_config.account, partition=slurm_config.partition, @@ -286,9 +295,7 @@ def build_slurm_executor( retries=0, packager=packager, srun_args=slurm_config.srun_args, - # --segment=: pin all nodes into one topology block (one NVL72 / NVLink - # domain). None -> omitted, scheduler places freely (default behavior). - segment=slurm_config.segment, + **optional_kwargs, ) return executor From 9882ee0d2cd7eb21455076ce657b2c55ec7be4ac Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:30:27 +0000 Subject: [PATCH 09/14] address comments Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- modelopt/recipe/config.py | 12 +++ modelopt/torch/speculative/config.py | 12 +++ .../plugins/hf_streaming_dataset.py | 21 +++++- .../speculative/plugins/hf_training_args.py | 2 + .../plugins/test_hf_streaming_dataset.py | 73 ++++++++++++++++++- tools/launcher/core.py | 20 +++++ .../moonshotai/Kimi-K2.5/specdec_bench.yaml | 11 ++- tools/launcher/slurm_config.py | 2 + 8 files changed, 144 insertions(+), 9 deletions(-) diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py index 97d93bbafc6..0932095f6d4 100644 --- a/modelopt/recipe/config.py +++ b/modelopt/recipe/config.py @@ -31,6 +31,18 @@ TrainingArguments as SpecTrainingArgs, ) +__all__ = [ + "RECIPE_TYPE_TO_CLASS", + "ModelOptDFlashRecipe", + "ModelOptEagleRecipe", + "ModelOptMedusaRecipe", + "ModelOptPTQRecipe", + "ModelOptRecipeBase", + "ModelOptSpeculativeRecipeBase", + "RecipeMetadataConfig", + "RecipeType", +] + class RecipeType(str, Enum): """List of recipe types. 
See ``RECIPE_TYPE_TO_CLASS`` at the bottom for the schema mapping.""" diff --git a/modelopt/torch/speculative/config.py b/modelopt/torch/speculative/config.py index 23ad200b6e7..708deafc0d1 100644 --- a/modelopt/torch/speculative/config.py +++ b/modelopt/torch/speculative/config.py @@ -23,6 +23,18 @@ from .eagle.default_config import default_eagle_config, default_kimik2_eagle_config +__all__ = [ + "DFLASH_DEFAULT_CFG", + "EAGLE3_DEFAULT_CFG", + "EAGLE_MTP_DEFAULT_CFG", + "DFlashConfig", + "EagleConfig", + "MedusaConfig", + "eagle3_default_config", + "eagle_mtp_default_config", + "kimik2_eagle_default_config", +] + kimik2_eagle_default_config = deepcopy(default_kimik2_eagle_config) eagle3_default_config = deepcopy(default_eagle_config) diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py index 9c3655b76fe..65b2cd4f0d7 100644 --- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py +++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py @@ -62,6 +62,14 @@ from modelopt.torch.utils import print_rank_0, warn_rank_0 +__all__ = [ + "EagleFetchPayload", + "EagleVllmStreamingConfig", + "EagleVllmStreamingDataset", + "StreamingConfig", + "StreamingDataset", +] + IGNORE_TOKEN_ID = LabelSmoother.ignore_index # The vLLM connector writes the safetensors file asynchronously (writer thread pool) @@ -71,6 +79,13 @@ _READ_RETRIES = 10 _READ_BACKOFF = 0.05 # seconds +# Errors from ``_fetch`` that are genuinely transient (server overloaded / connection +# reset / timeout, or the safetensors writer race) and so count against the circuit +# breaker and trigger a resample. Anything else -- notably the ``RuntimeError`` raised +# on server token drift, or a programming/contract bug (``ValueError``/``KeyError``) -- +# is a real fault and propagates instead of being silently masked as a fetch miss. 
+_TRANSIENT_FETCH_ERRORS = (httpx.HTTPError, OSError, SafetensorError) + def _tokenize_with_loss_mask( tokenizer, @@ -201,8 +216,10 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: continue # entry unfit pre-fetch; server not at fault, try the next one try: fetched = self._fetch(sample) - except Exception as e: - warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}") + except _TRANSIENT_FETCH_ERRORS as e: + # Transport/IO miss: count against the circuit breaker and resample. + # Contract violations and bugs are not caught here -- they propagate. + warn_rank_0(f"[streaming] fetch error for {sample['cid']}: {e!r}") fetched = None if fetched is None: self._consecutive_fail += 1 diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py index a9670ec1efd..2a9d4a1c4ff 100644 --- a/modelopt/torch/speculative/plugins/hf_training_args.py +++ b/modelopt/torch/speculative/plugins/hf_training_args.py @@ -33,6 +33,8 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +__all__ = ["DataArguments", "ModelArguments", "TrainingArguments"] + class ModelArguments(BaseModel): """Arguments for loading the base HF model.""" diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py index a0a37bc7afa..d4b910fe237 100644 --- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py +++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py @@ -85,8 +85,8 @@ def _format(self, fetched): def test_circuit_breaker_trips_on_consecutive_failures(): - """When _fetch keeps failing, __getitem__ raises after the threshold instead of - silently resampling the whole corpus.""" + """When _fetch keeps hitting transient errors (server down), __getitem__ raises + after the threshold instead of silently resampling the whole corpus.""" threshold = 3 class 
_AlwaysFails(StreamingDataset): @@ -94,7 +94,8 @@ def _tokenize_entry(self, entry): return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} def _fetch(self, sample): - raise RuntimeError("simulated server failure") + # A down server surfaces as a transport error, which the breaker counts. + raise httpx.ConnectError("simulated server down") ds = _AlwaysFails( _entries(20), @@ -105,6 +106,28 @@ def _fetch(self, sample): ds[0] +def test_contract_violation_propagates_not_swallowed(): + """A non-transient error from _fetch (e.g. a contract violation / bug) must + surface immediately, not be masked as a fetch miss and silently resampled.""" + + class _BadContract(StreamingDataset): + def _tokenize_entry(self, entry): + return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} + + def _fetch(self, sample): + raise RuntimeError("server token_ids drift") + + ds = _BadContract( + _entries(20), + tokenizer=MagicMock(), + # High threshold: if the error were (wrongly) swallowed, the breaker wouldn't + # fire, so a leaked breaker message would mask the regression. + config=StreamingConfig(fail_after_consecutive_skips=100), + ) + with pytest.raises(RuntimeError, match="server token_ids drift"): + ds[0] + + def test_fetch_returning_none_exhausts_then_raises(): """If every entry's fetch yields None (e.g. 
all rejected), __getitem__ raises a clear 'no fetchable sample' error rather than hanging or returning junk.""" @@ -238,6 +261,50 @@ def handler(request: httpx.Request) -> httpx.Response: assert list(scratch.iterdir()) == [], "scratch files must be unlinked after fetch" +def test_fetch_round_robins_across_server_urls(tmp_path, monkeypatch): + """With multiple server_urls, consecutive fetches alternate across endpoints so + load is spread over replicas rather than pinned to the first one.""" + seq, n_layers, hidden = 8, 3, 16 + scratch = tmp_path / "vllm_scratch" + scratch.mkdir() + + hosts: list[str] = [] + counter = {"n": 0} + + def handler(request: httpx.Request) -> httpx.Response: + hosts.append(request.url.host) + counter["n"] += 1 + path = scratch / f"req_{counter['n']}.safetensors" + _write_canned_safetensors(path, seq, n_layers, hidden) + return httpx.Response( + 200, + json={"kv_transfer_params": {"hidden_states_path": str(path)}}, + ) + + _patch_sync_client(monkeypatch, handler) + + n_entries = 4 + entries = [ + {"conversation_id": f"c-{i}", "messages": [{"role": "user", "content": "x"}]} + for i in range(n_entries) + ] + ds = EagleVllmStreamingDataset( + entries=entries, + tokenizer=_tokenizer_returning(seq), + config=EagleVllmStreamingConfig( + server_urls=["http://a:8000", "http://b:8000"], + model="mock-model", + shared_storage_root=str(scratch), + ), + ) + + for i in range(n_entries): + ds[i] + + # Per-process round-robin cursor: a, b, a, b -- one request each, alternating. 
+ assert hosts == ["a", "b", "a", "b"] + + def test_path_outside_shared_storage_root_is_rejected(tmp_path, monkeypatch): """Out-of-root path from the server is not opened or unlinked; the fetch yields None, so the single-entry corpus is exhausted and __getitem__ raises.""" diff --git a/tools/launcher/core.py b/tools/launcher/core.py index 0639d2afac7..dcdd86aad40 100644 --- a/tools/launcher/core.py +++ b/tools/launcher/core.py @@ -28,6 +28,26 @@ import nemo_run as run import yaml +__all__ = [ + "DEFAULT_EXPERIMENT_TITLE", + "GlobalVariables", + "SandboxPipeline", + "SandboxTask", + "SandboxTask0", + "SandboxTask1", + "SandboxTask2", + "SandboxTask3", + "SandboxTask4", + "build_docker_executor", + "build_slurm_executor", + "create_task_from_yaml", + "get_default_env", + "register_factory", + "report_versions", + "run_jobs", + "set_slurm_config_type", +] + # --------------------------------------------------------------------------- # Default environment variables injected into every job # --------------------------------------------------------------------------- diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml index a943f39c27e..84a77217453 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml @@ -18,8 +18,9 @@ # yaml and add `- --runtime_params ` below — see # examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern). # - --draft_model_dir must point at a trained+exported HF-format DFLASH draft -# (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Edit the path -# below, or override on the CLI: pipeline.task_0.args[0]="--draft_model_dir /hf-local/" +# (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). 
Set it via the +# `draft_model_dir` global_var below, or override on the CLI: +# pipeline.global_vars.draft_model_dir=/hf-local/ # - Kimi needs --trust_remote_code for both tokenizer and model. # # NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so @@ -53,12 +54,14 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 + # Trained + exported HF-format DFLASH draft checkpoint. Defaults to the standard + # export path; override on the CLI with: pipeline.global_vars.draft_model_dir= + draft_model_dir: /hf-local/nvidia/Kimi-K2.5-DFlash task_0: script: common/specdec_bench/run.sh args: - # TODO: point at your trained + exported HF-format DFLASH draft checkpoint. - - --draft_model_dir /hf-local/nvidia/Kimi-K2.5-DFlash + - --draft_model_dir <> - --speculative_algorithm DFLASH - --engine VLLM - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py index 0bcfff14ad9..9c3c853e877 100644 --- a/tools/launcher/slurm_config.py +++ b/tools/launcher/slurm_config.py @@ -24,6 +24,8 @@ import nemo_run as run +__all__ = ["SlurmConfig", "slurm_factory"] + @dataclass class SlurmConfig: From 9024ca92ddbefb649c7abcab628ab766ceaf7eb5 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:02:02 +0000 Subject: [PATCH 10/14] polish Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/eagle_utils.py | 3 +- examples/speculative_decoding/main.py | 15 +++--- .../speculative/plugins/hf_training_args.py | 6 +-- .../plugins/test_hf_streaming_dataset.py | 50 +++++++++++++++++++ .../Qwen/Qwen3-8B/hf_streaming_eagle3.yaml | 1 - 5 files changed, 59 insertions(+), 16 deletions(-) diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py index bcdcf15e8c2..f3ef93d740e 100644 --- a/examples/speculative_decoding/eagle_utils.py 
+++ b/examples/speculative_decoding/eagle_utils.py @@ -59,7 +59,6 @@ def make_speculative_data_module( train_len=None, answer_only_loss=False, shift_labels=True, - seed: int = 0, ) -> dict: """Create data module for speculative decoding training. @@ -90,7 +89,7 @@ def make_speculative_data_module( ds = ds.select(range(data_args.sample_size)) # Map-style dataset: each rank fetches its own DistributedSampler shard. # Fetch concurrency comes from the DataLoader's num_workers, not a config knob; - # shuffling/order is the sampler's job, so no seed is threaded here. + # shuffling/order is the sampler's job (seeded by training_args.seed). # ``server_urls`` accepts a comma-separated string for multi-server fan-out. streaming_cfg = EagleVllmStreamingConfig( server_urls=data_args.streaming_server_url, diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index 4405bf0cd90..bbb6cbd478a 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -267,7 +267,6 @@ def train(): train_len=training_args.training_seq_len, answer_only_loss=training_args.answer_only_loss, shift_labels=not is_dflash, - seed=training_args.seed, ) callbacks = [EagleTrainingPlot(training_args.ar_validate_steps, training_args.estimate_ar)] @@ -277,13 +276,13 @@ def train(): and recipe.eagle.eagle_base_lora_warmup_steps > 0 ): callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps)) - if recipe.data.mode == "streaming": - # The streaming dataset is map-style, so HF Trainer's default resume would - # fast-forward by re-iterating (= re-fetching) every consumed batch just to - # discard it, hammering the server. Disable the data skip: on resume, weights/ - # optimizer/global_step still restore from the checkpoint; only the data order - # restarts from the top (acceptable for single-epoch streaming). - training_args.ignore_data_skip = True + # NB: do NOT set training_args.ignore_data_skip for streaming. 
The dataset is + # map-style, so HF Trainer's resume skip goes through accelerate.skip_first_batches, + # which drops the already-consumed indices at the batch-sampler level -- those + # indices never reach __getitem__, so no hidden states are re-fetched from the + # server. Resume therefore lands at the exact data position for free (correct even + # when a single epoch is split across many checkpointed segments). ignore_data_skip + # would instead restart the data order from the top, silently re-running data. trainer = EagleTrainerWithAccLog( model=model, diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py index 2a9d4a1c4ff..6f86a467ab2 100644 --- a/modelopt/torch/speculative/plugins/hf_training_args.py +++ b/modelopt/torch/speculative/plugins/hf_training_args.py @@ -31,7 +31,7 @@ from typing import Literal -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, field_validator, model_validator __all__ = ["DataArguments", "ModelArguments", "TrainingArguments"] @@ -64,10 +64,6 @@ class DataArguments(BaseModel): sample_size: int = -1 streaming_server_url: str | None = None streaming_model_name: str | None = None - # Deprecated / no-op: the streaming dataset is map-style now, so fetch concurrency - # comes from the DataLoader's ``dataloader_num_workers``, not this knob. Kept so - # existing yamls that set ``data.streaming_prefetch`` still validate. - streaming_prefetch: int = Field(default=64, ge=1) # Mirror of the vLLM connector's ``shared_storage_path``; trainer-side allowlist. 
streaming_shared_storage_path: str | None = None diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py index d4b910fe237..e6bac5b9755 100644 --- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py +++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py @@ -148,6 +148,56 @@ def _fetch(self, sample): ds[0] +def test_resume_skips_consumed_samples_without_refetching(): + """Map-style resume contract: HF Trainer skips consumed batches via + accelerate.skip_first_batches, which drops their indices at the batch-sampler + level so __getitem__ (and thus _fetch) is never called for them. This is why + main.py leaves ignore_data_skip at its default (False) for streaming -- resume + lands at the exact position with no re-fetch. Guards against a regression that + would re-fetch (or re-stream) already-consumed samples on resume.""" + pytest.importorskip("accelerate") + from accelerate import skip_first_batches + from torch.utils.data import DataLoader, RandomSampler + + fetched: list[int] = [] + + class _Recording(StreamingDataset): + def _tokenize_entry(self, entry): + return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None} + + def _fetch(self, sample): + cid = int(sample["cid"]) + fetched.append(cid) # stands in for the HTTP fetch + return {"cid": cid} + + def _format(self, payload): + return torch.tensor(payload["cid"]) + + n, batch_size, skip_batches = 20, 2, 3 + ds = _Recording(_entries(n), tokenizer=MagicMock(), config=StreamingConfig()) + + def make_dl(): + # Fresh, identically-seeded sampler -> identical permutation across runs. + return DataLoader( + ds, + batch_size=batch_size, + sampler=RandomSampler(ds, generator=torch.Generator().manual_seed(0)), + ) + + # Full pass -> ground-truth consumption order (cid == requested index here). 
+ full_order = [int(x) for batch in make_dl() for x in batch] + fetched.clear() + + # Resume: skip the first `skip_batches` batches. + tail_order = [int(x) for batch in skip_first_batches(make_dl(), skip_batches) for x in batch] + + consumed = full_order[: skip_batches * batch_size] + expected_tail = full_order[skip_batches * batch_size :] + assert tail_order == expected_tail, "resume must continue at the exact data position" + assert set(fetched).isdisjoint(consumed), "skipped (consumed) samples must not be re-fetched" + assert fetched == expected_tail, "only the un-consumed tail is fetched after resume" + + def test_server_urls_normalization(): """server_urls accepts a single string, a comma-separated string, or a list, and strips trailing slashes.""" diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml index d93525632f3..91b8c54a8b5 100644 --- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml +++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml @@ -48,7 +48,6 @@ pipeline: - model.model_name_or_path=<> - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - - data.streaming_prefetch=64 - training.output_dir=/scratchspace/eagle3 - training.training_seq_len=4096 - training.disable_tqdm=true From 94d8bd4378118883b072f22a10df68e27c801f52 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:14:27 +0000 Subject: [PATCH 11/14] trim comments Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/launch_train.sh | 28 +++------ examples/speculative_decoding/main.py | 11 ++-- .../common/eagle3/train_eagle_streaming.sh | 53 +++++++--------- .../hf_streaming_eagle3_multi_node.yaml | 9 ++- .../Kimi-K2.5/hf_dflash_dryrun.yaml | 20 +++--- .../Kimi-K2.5/hf_streaming_dflash.yaml | 63 ++++++++----------- .../hf_streaming_dflash_multi_node.yaml | 56 
+++++++---------- .../Kimi-K2.5/hf_streaming_eagle3.yaml | 13 ++-- .../hf_streaming_eagle3_multi_node.yaml | 22 +++---- .../moonshotai/Kimi-K2.5/specdec_bench.yaml | 6 +- 10 files changed, 115 insertions(+), 166 deletions(-) diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh index cd890c72053..fdcc123bb4a 100755 --- a/examples/speculative_decoding/launch_train.sh +++ b/examples/speculative_decoding/launch_train.sh @@ -61,22 +61,12 @@ fi # Multi-node routing args (accelerate only; training config comes from the YAML) MULTI_NODE_ARGS=() if [[ "$NUM_NODES" != "1" ]]; then - # machine_rank: caller may pass --machine_rank explicitly (needed when the - # SLURM allocation reserves node 0 for something else, e.g. the streaming - # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank). - # Default to $SLURM_PROCID for the all-nodes-are-trainers case. - # Canonical accelerate multi-node launch for a fixed Slurm allocation: - # --multi_gpu + static rendezvous via main_process_ip/port (-> MASTER_ADDR/PORT). - # - # --multi_gpu is REQUIRED: with 1 GPU/node, each node's local process count is - # num_processes/num_machines = 1, and without --multi_gpu accelerate treats a - # single local process as non-distributed -- it never sets WORLD_SIZE/RANK or - # forms the process group, so every node trains the full dataset as its own - # world=1 (no hang, no real DDP). --multi_gpu forces DistributedType.MULTI_GPU - # so the nodes rendezvous into one world=$TOTAL_GPU group. - # - # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which - # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip. + # machine_rank defaults to $SLURM_PROCID; pass --machine_rank explicitly when the + # allocation reserves node 0 for something else (e.g. a streaming vllm serve). 
+ # --multi_gpu is required even at 1 GPU/node -- without it accelerate treats a lone + # local process as non-distributed and never forms the process group (each node + # would train its own world=1). Use static rendezvous via main_process_ip/port; NOT + # --rdzv_backend c10d, which switches to the elastic launcher and ignores it. MULTI_NODE_ARGS=( --multi_gpu --num_processes "$TOTAL_GPU" @@ -89,10 +79,8 @@ fi export TOKENIZERS_PARALLELISM=False -# Build the argv directly (no `sh -c`): a re-parsed command string would word-split -# overrides that contain spaces (e.g. training.output_dir=/tmp/has space) and would -# execute command substitutions embedded in override values. An array preserves each -# argument boundary verbatim. +# Run as an argv array (not `sh -c "..."`, which would word-split overrides +# containing spaces and execute command substitutions embedded in their values). CMD=(accelerate launch --mixed_precision bf16 "${MULTI_NODE_ARGS[@]}" "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}") diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index bbb6cbd478a..f62b099121d 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -276,13 +276,10 @@ def train(): and recipe.eagle.eagle_base_lora_warmup_steps > 0 ): callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps)) - # NB: do NOT set training_args.ignore_data_skip for streaming. The dataset is - # map-style, so HF Trainer's resume skip goes through accelerate.skip_first_batches, - # which drops the already-consumed indices at the batch-sampler level -- those - # indices never reach __getitem__, so no hidden states are re-fetched from the - # server. Resume therefore lands at the exact data position for free (correct even - # when a single epoch is split across many checkpointed segments). 
ignore_data_skip - # would instead restart the data order from the top, silently re-running data. + # Leave training_args.ignore_data_skip at its default (False). The dataset is + # map-style, so HF Trainer's resume skips consumed indices at the batch-sampler + # level (accelerate.skip_first_batches) without re-fetching them, landing at the + # exact data position. Setting it True would restart the data order from the top. trainer = EagleTrainerWithAccLog( model=model, diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index b3637b1621d..cd6bee89ac0 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -23,7 +23,7 @@ # `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it # branches on $SLURM_NODEID: # nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest -# of the local GPUs (original single-node behavior). +# of the local GPUs. # nodes >= 2 -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent # vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1 # are trainers doing multi-node DDP. SERVE_NODES defaults to 1 @@ -36,8 +36,8 @@ # The streaming dataset is map-style: HF Trainer's DistributedSampler shards the # corpus across all trainer ranks and each rank fetches ONLY its own shard, # round-robin across the SERVE_NODES replicas (data.streaming_server_url is the -# comma-joined list). So trainer nodes scale effective batch / compute and -# distribute the reads; serve nodes scale data-production throughput (~K x). +# comma-joined list). Trainer nodes scale compute and distribute the reads; serve +# nodes scale data-production throughput. # # Env vars (required): # HF_MODEL_CKPT Target model path. 
Used by both vllm serve (as the @@ -79,14 +79,12 @@ source "${SCRIPT_DIR}/../service_utils.sh" # Container provisioning # # vllm/vllm-openai:* has vllm and torch but not modelopt or the speculative -# trainer's deps. modelopt is bind-mounted at -# /usr/local/lib/python3.12/dist-packages/modelopt, but it has no .dist-info -# (so `importlib.metadata.version('nvidia-modelopt')` would fail). nemo_run -# only ships modelopt subdirs, not the real pyproject.toml, so we synthesize -# a minimal one with a correctly-scoped setuptools.packages.find include — -# without `include = ["modelopt*"]`, setuptools sees both `modelopt/` and -# `modelopt_recipes/` at the top level and refuses with a "flat-layout" -# error. We then `pip install -e .` to register the dist-info. +# trainer's deps. modelopt is bind-mounted but has no .dist-info (so +# `importlib.metadata.version('nvidia-modelopt')` would fail), and nemo_run does +# not ship the real pyproject.toml, so we synthesize a minimal one and +# `pip install -e .` to register the dist-info. The setuptools.packages.find +# `include` must be scoped (modelopt*, modelopt_recipes*) or setuptools sees two +# top-level packages and fails with a "flat-layout" error. TOML=modules/Model-Optimizer/pyproject.toml if [ ! -f "$TOML" ]; then @@ -144,13 +142,13 @@ SERVE_NODES="${SERVE_NODES:-1}" # by a unique vllm request id, so they don't collide across servers. SERVE_SCRATCH="/scratchspace/streaming_serve_scratch" SERVE_LOG="/scratchspace/vllm_serve.log" # serve nodes override with a per-node path -# Rendezvous over the shared /scratchspace mount (lustre, visible on every node): -# each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer -# signals completion via DONE_FILE; trainers collect all serve addresses. 
+# Rendezvous over the shared /scratchspace mount (visible on every node): each serve +# node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer signals +# completion via DONE_FILE; trainers collect all serve addresses. # Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on # the same shared mount don't read/write each other's addresses. SLURM_JOB_ID is -# identical across every node of one allocation (so the namespacing is consistent) -# and unique across allocations; falls back to a fixed token off-Slurm (single run). +# identical across every node of one allocation and unique across allocations; falls +# back to a fixed token off-Slurm (single run). RUN_ID="${SLURM_JOB_ID:-local}" SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}" DONE_FILE="/scratchspace/.training_done.${RUN_ID}" @@ -199,11 +197,9 @@ launch_vllm() { # features skip recomputing cached/partial prefixes, which yields short or # empty hidden_states. Required, not optional. # --no-enable-flashinfer-autotune: on big NVFP4 MoE (Kimi) the flashinfer - # trtllm_fp4_block_scale_moe autotuner re-tunes on the first real serving - # step and stalls a worker past vLLM's execute-model timeout -> EngineCore - # dies with "RPC call to sample_tokens timed out" -> 500s -> trainer aborts. - # Disabling autotune keeps kernels static (and pairs with the larger - # VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS set in the example env). + # autotuner re-tunes on the first real serving step and stalls a worker past + # vLLM's execute-model timeout, killing EngineCore and aborting the trainer. + # Required there; keeps kernels static. "${gpu_env[@]}" vllm serve "$HF_MODEL_CKPT" \ --host "$bind_host" \ --port "$SERVE_PORT" \ @@ -249,22 +245,21 @@ wait_vllm_ready() { # Run the trainer then export the HF checkpoint. 
# $1 = streaming server base URL $2 = CUDA_VISIBLE_DEVICES ("" -> all) -# The streaming dataset is map-style now, so fetch concurrency comes from the -# DataLoader's workers (each worker = one in-flight request). STREAMING_NUM_WORKERS -# sets that; keep it modest so (ranks-per-server x workers) stays near the server's -# max_num_seqs (flooding a cold NVFP4 MoE server kills EngineCore). 0 disables -# prefetch (serialized fetches) and is usually too slow. +# Fetch concurrency comes from the DataLoader's workers (each worker = one in-flight +# request). STREAMING_NUM_WORKERS sets that; keep it modest so (ranks-per-server x +# workers) stays near the server's max_num_seqs (flooding a cold NVFP4 MoE server +# kills EngineCore). 0 disables prefetch (serialized fetches) and is usually too slow. run_trainer_and_export() { local url="$1" cvd="$2" - # Optional multi-node trainer routing (see dispatch section). Defaults keep - # the original single-trainer-node behavior: no --num_nodes, export on rank 0. + # Optional multi-node trainer routing (see dispatch section). Defaults: single + # trainer node, no --num_nodes, export on rank 0. local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}" echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..." # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all). local -a gpu_env=() [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") # Engage accelerate multi-node routing only when >1 trainer node; a single - # trainer node keeps the original invocation (no --num_nodes) verbatim. + # trainer node omits --num_nodes. 
local -a mn_args=() if [ "${num_tnodes}" -gt 1 ]; then mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank") diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml index aac5b71ecdf..d0c99f6f0be 100644 --- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml +++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml @@ -15,9 +15,8 @@ # How it scales: the dataset is map-style, so HF Trainer's DistributedSampler # shards the corpus across ALL trainer ranks and each rank fetches ONLY its own # shard, round-robin across the K serve replicas (data.streaming_server_url is the -# comma-joined list). Trainer nodes scale effective batch / compute and distribute -# the lustre reads; serve nodes scale data-production throughput (~K x), lifting -# the single-serve ceiling. +# comma-joined list). Trainer nodes scale effective batch / compute; serve nodes +# scale data-production throughput (~K x). # # 3-step pipeline: # task_0: Build input conversations (jsonl) @@ -79,8 +78,8 @@ pipeline: # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. - SERVE_NODES: "2" # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES. - # Here 4 ranks (2 nodes x 2 GPU) x 4 / 2 serves = 8 concurrent per serve — fine - # for Qwen's max_num_seqs. + # Here 4 ranks x 4 / 2 serves = 8 concurrent per serve — fine for Qwen's + # max_num_seqs. 
- STREAMING_NUM_WORKERS: "4" slurm_config: _factory_: "slurm_factory" diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml index b12c3b0f538..47ef2950b95 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml @@ -9,27 +9,25 @@ # the (untrained) ModelOpt checkpoint # to training.output_dir right after # mtsp.convert(model, [("dflash", ...)]) -# data.offline_data_path= → DataArguments derives data.mode from -# the data-source fields, so setting an -# offline path makes mode='offline' → -# use_offline_training=True. Combined -# with use_fake_base_for_offline=true -# this loads a FakeBaseModel (only +# data.offline_data_path= → setting an offline path makes +# mode='offline' → use_offline_training +# =True. Combined with +# use_fake_base_for_offline=true this +# loads a FakeBaseModel (only # embed_tokens + lm_head), so the ~1T # MoE base fits on a single GPU. The # file is never read in --dry_run mode. # model.trust_remote_code=true → Kimi-K2.5 (deepseek_v3 arch) ships a # custom modeling file -# dflash.dflash_mask_token_id=163838 → Kimi-K2.5 has no dedicated mask token -# ([EOS]=163585, [PAD]=163839); 163838 is -# a reserved slot used as the DFlash mask -# (matches the real Kimi-K2.5 DFlash run) +# dflash.dflash_mask_token_id=163838 → Kimi-K2.5 has no dedicated mask token; +# 163838 is a reserved slot used as the +# DFlash mask # # The dflash_online_training.sh export block then writes an HF-format DFlash draft # to /scratchspace/dflash/exported-checkpoint-final with the correct architecture # (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by # design. Useful for smoke-testing the launcher / convert / export plumbing and -# validating downstream loaders without paying for a real training run. 
+# validating downstream loaders without a real training run. # # Usage: # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml index d16ca3822c7..62e3c742e65 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml @@ -1,27 +1,23 @@ # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on # GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve + -# trainer split, same hardware reasoning — but trains a DFlash drafter instead of -# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the -# dflash recipe. +# trainer split, but trains a DFlash drafter instead of EAGLE3 by pointing the +# shared, algorithm-agnostic streaming script at the dflash recipe. # -# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native -# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, -# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4, -# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes. +# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 +# model fit at TP=4 on ONE 4-GPU node with no cpu-offload. Topology: node 0 = +# vllm serve (TP=4, whole node), node 1 = DFlash trainer (fake base), 2 nodes. # -# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired -# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py -# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets -# dflash_offline=True and the DFlash module consumes the streamed hidden states -# (base_model_outputs) instead of running the fake base. 
The vLLM connector, -# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps -# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST -# captured layer into base_model_hidden_states (used for DFlash self-logit -# distillation) and the REST into aux_hidden_states (DFlash's concatenated -# target-layer features). So n_captured must be (num DFlash target layers + 1). +# How streaming feeds DFlash: data.mode=streaming sets dflash_offline=True +# (derived in modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), +# so the DFlash module consumes the streamed hidden states (base_model_outputs) +# instead of running the fake base. vLLM dumps captured layers as +# [seq, n_captured, hidden]; the dataset splits the LAST captured layer into +# base_model_hidden_states (DFlash self-logit distillation) and the REST into +# aux_hidden_states (DFlash's concatenated target-layer features). So n_captured +# must be (num DFlash target layers + 1). # # Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; -# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3): +# the true final layer is NOT capturable so we use 60 as the base): # DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5) # = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) = # [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the @@ -30,8 +26,8 @@ # answer_only_loss: forced false here. DFlash's recipe default is true, which # requires the tokenizer chat template to carry {% generation %} tags so the # streaming dataset can derive an assistant-token mask; Kimi's template does not, -# and the streaming path (unlike online) does not inject data.chat_template. To -# train assistant-only later, supply a generation-tagged template and flip this on. +# and the streaming path does not inject data.chat_template. 
To train +# assistant-only later, supply a generation-tagged template and flip this on. # # Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): # export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ @@ -70,8 +66,7 @@ pipeline: container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node. - # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh): - # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ. + # Reuses the shared streaming orchestrator common/eagle3/train_eagle_streaming.sh. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -102,27 +97,24 @@ pipeline: # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" - SERVE_TP: "4" - # DataLoader workers per trainer rank = in-flight requests per rank. The - # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0), - # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS. - # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE - # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort. + # DataLoader workers per trainer rank = in-flight requests per rank. All + # trainer ranks fetch, so per-serve in-flight = trainer_world_size(4) x + # STREAMING_NUM_WORKERS. Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood + # stalls a cold NVFP4-MoE worker past vLLM's timeout and kills EngineCore. - STREAMING_NUM_WORKERS: "1" # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" - # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's - # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the - # training seq len and leave headroom for activation spikes. 
+ # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context + # to the training seq len, leaving headroom for activation spikes. - SERVE_MAX_MODEL_LEN: "4096" # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # The killer was "RPC call to sample_tokens timed out" — a worker stalls on - # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default - # execute-model timeout, so EngineCore dies. Extend the timeouts that govern - # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. + # A worker can stall on the first real serving step (cold NVFP4 MoE kernels) + # past vLLM's default execute-model timeout, killing EngineCore. Extend the + # timeouts (seconds) that govern that path. - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: @@ -130,8 +122,7 @@ pipeline: nodes: 2 # Pin the serve node + trainer node into one NVL72 block. Inter-node here is # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this - # is a latency/locality nicety rather than a correctness requirement (cf. the - # multi_node examples, where cross-node trainer DDP makes segment essential). + # is a latency/locality nicety rather than a correctness requirement. segment: 2 ntasks_per_node: 1 gpus_per_node: 4 diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml index b645a0e428c..703f636d53b 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -6,9 +6,8 @@ # (world_size=8). 
Tune the split via slurm_config.nodes/segment + SERVE_NODES: # nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. # -# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native -# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, -# ~138 GB/GPU of weights) with NO cpu-offload. So each serve replica owns a whole +# Why GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 model fit +# at TP=4 on ONE 4-GPU node with no cpu-offload. So each serve replica owns a whole # node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base). # # Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): @@ -18,29 +17,24 @@ # segment= pins all nodes into one NVL72 block so inter-node DDP traffic # rides NVLink. # -# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF -# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each -# rank fetches ONLY its own shard, round-robin across the K serve replicas -# (data.streaming_server_url is the comma-joined list the script assembles). So -# trainer nodes scale effective batch / compute and distribute the lustre reads; -# serve nodes scale data-production throughput (~K x), lifting the single-serve -# ceiling that bounded the old single-serve multi-node path. +# How it scales: HF Trainer's DistributedSampler shards the corpus across ALL +# trainer ranks and each rank fetches ONLY its own shard, round-robin across the K +# serve replicas (data.streaming_server_url is the comma-joined list the script +# assembles). Trainer nodes scale effective batch / compute and distribute the +# lustre reads; serve nodes scale data-production throughput (~K x). # -# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below): -# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES -# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. 
Flooding a cold NVFP4-MoE server -# stalls a worker past vLLM's execute-model timeout and kills EngineCore. +# Concurrency: keep it low for cold NVFP4 MoE (see SERVE_MAX_NUM_SEQS below). +# Per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES +# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. # -# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired -# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py -# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets -# dflash_offline=True and the DFlash module consumes the streamed hidden states -# (base_model_outputs) instead of running the fake base. The vLLM connector, -# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps -# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST -# captured layer into base_model_hidden_states (used for DFlash self-logit -# distillation) and the REST into aux_hidden_states (DFlash's concatenated -# target-layer features). So n_captured must be (num DFlash target layers + 1). +# How streaming feeds DFlash: data.mode=streaming derives dflash_offline=True +# (modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), so the +# DFlash module consumes the streamed hidden states (base_model_outputs) instead of +# running the fake base. vLLM dumps captured layers as [seq, n_captured, hidden]; +# the dataset splits the LAST captured layer into base_model_hidden_states (DFlash +# self-logit distillation) and the REST into aux_hidden_states (DFlash's +# concatenated target-layer features). So n_captured must be (num DFlash target +# layers + 1). # # Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; # the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3): @@ -130,24 +124,20 @@ pipeline: - SERVE_TP: "4" # DataLoader workers per trainer rank = in-flight requests per rank. 
Keep at 1: # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS. - # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker - # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. - STREAMING_NUM_WORKERS: "1" # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" - # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's - # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the - # training seq len and leave headroom for activation spikes. + # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context + # to the training seq len and leave headroom for activation spikes. - SERVE_MAX_MODEL_LEN: "4096" # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # The killer was "RPC call to sample_tokens timed out" — a worker stalls on - # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default - # execute-model timeout, so EngineCore dies. Extend the timeouts that govern - # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. + # A worker can stall on the first real serving step (cold NVFP4 MoE kernels) + # past vLLM's default execute-model timeout, killing EngineCore. Extend the + # timeouts that govern that path (seconds). 
- VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml index 9ed18150869..3f6cf23d1b7 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml @@ -1,12 +1,9 @@ -# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell -# (HSG). This is the streaming config that actually runs end-to-end: on CW H100 -# the ~551 GB model needed cpu-offload (-> ~1 tok/s -> vLLM EngineCore -# TimeoutError), so the working path is GB200. +# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell (HSG). # -# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native -# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, -# ~138 GB/GPU of weights) with NO cpu-offload. So here: node 0 = vllm serve -# (TP=4, whole node), node 1 = EAGLE3 trainer (fake base), 4 GPUs each, 2 nodes. +# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE +# 4-GPU node with no cpu-offload (on H100 it needs offload and is too slow to be +# usable). Topology: node 0 = vllm serve (TP=4, whole node), node 1 = EAGLE3 trainer +# (fake base); 4 GPUs each, 2 nodes. # # Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed # by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60. 
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml index e0c32debb0e..bddf6b06909 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml @@ -6,12 +6,9 @@ # via slurm_config.nodes/segment + SERVE_NODES: # nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. # -# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native -# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB, -# ~138 GB/GPU of weights) with NO cpu-offload. On CW H100 the model needed -# cpu-offload (-> ~1 tok/s -> vLLM EngineCore TimeoutError), so GB200 is the -# working path. Each serve replica owns a whole node at TP=4; each trainer node -# uses all 4 GPUs for the draft (fake base). +# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE +# 4-GPU node with no cpu-offload. Each serve replica owns a whole node at TP=4; each +# trainer node uses all 4 GPUs for the draft (fake base). # # Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): # Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes @@ -20,13 +17,12 @@ # segment= pins all nodes into one NVL72 block so inter-node DDP traffic # rides NVLink. # -# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF -# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each -# rank fetches ONLY its own shard, round-robin across the K serve replicas -# (data.streaming_server_url is the comma-joined list the script assembles). So -# trainer nodes scale effective batch / compute and distribute the lustre reads; -# serve nodes scale data-production throughput (~K x), lifting the single-serve -# ceiling. 
+# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler shards +# the corpus across ALL trainer ranks and each rank fetches ONLY its own shard, +# round-robin across the K serve replicas (data.streaming_server_url is the +# comma-joined list the script assembles). So trainer nodes scale effective batch / +# compute and distribute the lustre reads; serve nodes scale data-production +# throughput (~K x), lifting the single-serve ceiling. # # Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below): # per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml index 84a77217453..b2eea8c1ec6 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml @@ -6,10 +6,8 @@ # aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl # to /scratchspace/specdec_bench/. # -# Hardware = GB200/Blackwell (HSG), same reasoning as hf_streaming_eagle3.yaml: -# Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the 192 GB/GPU of GB200; it fits -# at TP=4 on ONE 4-GPU node with no cpu-offload. On CW H100 it has no native FP4 -# and falls back to offload, so the working path is GB200. +# Requires GB200/Blackwell (HSG): Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the +# 192 GB/GPU of GB200 to fit at TP=4 on ONE 4-GPU node with no cpu-offload. 
# # DFLASH specifics: # - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8); From 221a3aed78270e4a5bffad9e976e43804b1789bd Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:27:49 +0000 Subject: [PATCH 12/14] trim comments Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/launch_train.sh | 18 +- .../common/eagle3/train_eagle_streaming.sh | 190 ++++++------------ tools/launcher/core.py | 24 +-- .../Qwen/Qwen3-8B/hf_streaming_eagle3.yaml | 23 +-- .../hf_streaming_eagle3_multi_node.yaml | 34 +--- .../Kimi-K2.5/hf_dflash_dryrun.yaml | 39 +--- .../Kimi-K2.5/hf_streaming_dflash.yaml | 78 ++----- .../hf_streaming_dflash_multi_node.yaml | 102 +++------- .../Kimi-K2.5/hf_streaming_eagle3.yaml | 48 ++--- .../hf_streaming_eagle3_multi_node.yaml | 74 ++----- .../moonshotai/Kimi-K2.5/specdec_bench.yaml | 55 ++--- 11 files changed, 186 insertions(+), 499 deletions(-) diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh index fdcc123bb4a..a6104f35fe6 100755 --- a/examples/speculative_decoding/launch_train.sh +++ b/examples/speculative_decoding/launch_train.sh @@ -19,9 +19,8 @@ # Multi-node: ./launch_train.sh --config ../../modelopt_recipes/general/speculative_decoding/eagle3.yaml --num_nodes 2 --head_node_ip # With overrides: ./launch_train.sh --config my.yaml model.model_name_or_path=xxx training.output_dir=yyy # -# Extra key=value args are forwarded as OmegaConf dotlist overrides to main.py. -# All training config (model, data, hyperparams, eagle, fsdp) lives in the YAML file. -# Only multi-node routing args are passed here; mixed_precision is fixed to bf16. +# Extra key=value args are forwarded as OmegaConf dotlist overrides to main.py; all +# training config lives in the YAML. mixed_precision is fixed to bf16. 
set -eo pipefail @@ -48,7 +47,6 @@ if [ -z "$CONFIG_FILE" ]; then exit 1 fi -# GPU count detection if [[ "$NUM_NODES" != "1" ]]; then GPU_PER_NODE=${GPU_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)} TOTAL_GPU=$((NUM_NODES * GPU_PER_NODE)) @@ -58,15 +56,10 @@ else echo "Total GPUs: $TOTAL_GPU (single node)" fi -# Multi-node routing args (accelerate only; training config comes from the YAML) MULTI_NODE_ARGS=() if [[ "$NUM_NODES" != "1" ]]; then - # machine_rank defaults to $SLURM_PROCID; pass --machine_rank explicitly when the - # allocation reserves node 0 for something else (e.g. a streaming vllm serve). - # --multi_gpu is required even at 1 GPU/node -- without it accelerate treats a lone - # local process as non-distributed and never forms the process group (each node - # would train its own world=1). Use static rendezvous via main_process_ip/port; NOT - # --rdzv_backend c10d, which switches to the elastic launcher and ignores it. + # --multi_gpu is required even at 1 GPU/node, else accelerate won't form the DDP group. + # machine_rank defaults to $SLURM_PROCID; override --machine_rank if node 0 isn't a trainer. MULTI_NODE_ARGS=( --multi_gpu --num_processes "$TOTAL_GPU" @@ -79,8 +72,7 @@ fi export TOKENIZERS_PARALLELISM=False -# Run as an argv array (not `sh -c "..."`, which would word-split overrides -# containing spaces and execute command substitutions embedded in their values). +# argv array, not `sh -c` (which would word-split overrides and run embedded substitutions). 
CMD=(accelerate launch --mixed_precision bf16 "${MULTI_NODE_ARGS[@]}" "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}") diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index cd6bee89ac0..6f9636c459a 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -15,76 +15,50 @@ # See the License for the specific language governing permissions and # limitations under the License. -# EAGLE3 streaming training: runs a `vllm serve` (KV-transfer producer of hidden -# states) alongside the trainer and routes hidden states over HTTP rather than -# dumping to disk. Sibling of train_eagle.sh. +# EAGLE3 streaming training: a `vllm serve` (KV-transfer hidden-states producer) +# runs alongside the trainer, routing hidden states over HTTP not disk. # -# Topology is chosen automatically from the Slurm allocation (the launcher yaml's -# `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it -# branches on $SLURM_NODEID: -# nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest -# of the local GPUs. -# nodes >= 2 -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent -# vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1 -# are trainers doing multi-node DDP. SERVE_NODES defaults to 1 -# (1 serve + N trainers). Rendezvous over the shared -# /scratchspace mount: each serve i publishes its address to -# .serve_addr.i; the head trainer (first trainer node, -# accelerate machine_rank 0) publishes its IP for accelerate's -# rendezvous; trainers collect every serve address. -# -# The streaming dataset is map-style: HF Trainer's DistributedSampler shards the -# corpus across all trainer ranks and each rank fetches ONLY its own shard, -# round-robin across the SERVE_NODES replicas (data.streaming_server_url is the -# comma-joined list). 
Trainer nodes scale compute and distribute the reads; serve -# nodes scale data-production throughput. +# CANONICAL TOPOLOGY/DISPATCH (per-example YAMLs cross-reference here). Topology is +# auto-chosen from the Slurm allocation (yaml `nodes:`) and $SERVE_NODES; nemo_run +# runs this script once per node, branching on $SLURM_NODEID: +# nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest. +# nodes >= 2 -> split: nodes 0..SERVE_NODES-1 each run an independent whole-node +# vllm serve replica; nodes SERVE_NODES..NNODES-1 are multi-node-DDP +# trainers. SERVE_NODES default 1. Rendezvous over shared +# /scratchspace: each serve i publishes .serve_addr.i; head trainer +# (first trainer node = accelerate machine_rank 0) publishes its IP; +# trainers collect every serve address. +# Map-style dataset: DistributedSampler shards the corpus across trainer ranks, each +# rank fetches only its shard round-robin across the SERVE_NODES replicas +# (data.streaming_server_url = comma-joined list). # # Env vars (required): -# HF_MODEL_CKPT Target model path. Used by both vllm serve (as the -# model arg, becomes the served-model-name) and the -# trainer (data.streaming_model_name). -# EAGLE_CAPTURE_IDS JSON list of 1-based layer ids vllm should capture. -# Must equal default_eagle_aux_layer_ids(L) shifted by +1, -# plus the final layer L. For Qwen3-8B (L=36): -# default = [1,17,32] -> capture = [2,18,33,36]. +# HF_MODEL_CKPT Target model path; vllm serve model arg (= served-model-name) +# and trainer data.streaming_model_name. +# EAGLE_CAPTURE_IDS JSON 1-based layer ids to capture = default_eagle_aux_layer_ids(L) +# +1, plus final layer L. Qwen3-8B (L=36): [1,17,32]->[2,18,33,36]. # # Env vars (optional): -# SERVE_NODES multi-node only: number of dedicated serve replica nodes -# (Slurm nodes 0..SERVE_NODES-1). default 1. 
-# SERVE_PORT default 8765 -# SERVE_GPU_MEM_UTIL default 0.4 (single-node) / 0.9 (multi-node serve node) -# SERVE_READY_TIMEOUT seconds to wait for the server to come up. default 900 -# SERVE_EXTRA_ARGS extra flags appended to `vllm serve` (e.g. --trust-remote-code) -# SERVE_CPU_OFFLOAD_GB GB of weights/GPU to offload to host RAM (fits big models -# on too-few GPUs; slower). e.g. "10" -# SERVE_MAX_MODEL_LEN cap vllm context length (trims KV/activation). e.g. "4096" -# SERVE_MAX_NUM_SEQS cap concurrent sequences (trims KV/activation). e.g. "8" -# SERVE_HOST single-node only: bind/connect host. default 127.0.0.1 -# SERVE_GPU single-node only: CUDA_VISIBLE_DEVICES for vllm. default "0" -# SERVE_TP tensor-parallel size. default 1 (single-node) / all GPUs -# on the serve node (multi-node) -# TRAIN_GPUS single-node only: CUDA_VISIBLE_DEVICES for the trainer. -# default = all local GPUs except SERVE_GPU. -# SERVE_ADVERTISE_IP multi-node only: address node 1 should dial. default is -# node 0's routable IP (its resolved Slurm node name, else -# its first non-loopback / non-link-local IP). -# -# All script args are forwarded to launch_train.sh (typically: --config -# plus OmegaConf dotlist overrides). +# SERVE_NODES multi-node: dedicated serve replica nodes (0..SERVE_NODES-1). default 1 +# SERVE_GPU_MEM_UTIL default 0.4 single-node / 0.9 multi-node serve node +# SERVE_READY_TIMEOUT server startup wait, seconds. default 900 +# SERVE_EXTRA_ARGS extra `vllm serve` flags (e.g. --trust-remote-code) +# SERVE_CPU_OFFLOAD_GB GB/GPU offloaded to host RAM (fits big models on too-few GPUs; slower) +# SERVE_MAX_MODEL_LEN cap context length (trims KV/activation) +# SERVE_MAX_NUM_SEQS cap concurrent sequences (trims KV/activation) +# SERVE_HOST single-node: bind/connect host. default 127.0.0.1 +# SERVE_GPU single-node: CUDA_VISIBLE_DEVICES for vllm. default "0" +# SERVE_TP tensor-parallel size. 
default 1 single-node / all serve-node GPUs +# TRAIN_GPUS single-node: trainer CUDA_VISIBLE_DEVICES. default = all but SERVE_GPU +# SERVE_ADVERTISE_IP multi-node: address node 1 dials. default node 0's routable IP SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" source "${SCRIPT_DIR}/../service_utils.sh" ################################################################################################### -# Container provisioning -# -# vllm/vllm-openai:* has vllm and torch but not modelopt or the speculative -# trainer's deps. modelopt is bind-mounted but has no .dist-info (so -# `importlib.metadata.version('nvidia-modelopt')` would fail), and nemo_run does -# not ship the real pyproject.toml, so we synthesize a minimal one and -# `pip install -e .` to register the dist-info. The setuptools.packages.find -# `include` must be scoped (modelopt*, modelopt_recipes*) or setuptools sees two -# top-level packages and fails with a "flat-layout" error. +# Container provisioning: the vllm image lacks modelopt's .dist-info and the real +# pyproject, so synthesize a minimal pyproject (scoped `include` avoids setuptools' +# flat-layout error) and `pip install -e .`. TOML=modules/Model-Optimizer/pyproject.toml if [ ! -f "$TOML" ]; then @@ -130,25 +104,18 @@ if [ -z "$EAGLE_CAPTURE_IDS" ]; then echo "ERROR: EAGLE_CAPTURE_IDS must be set (e.g. '[2, 18, 33, 36]' for Qwen3-8B)." >&2; exit 1 fi -# Everything passed to this script (--config + OmegaConf dotlist) is -# forwarded verbatim to the trainer. Capture it before the helpers below run. +# Forwarded verbatim to the trainer; capture before the helpers below run. SCRIPT_ARGS=("$@") SERVE_PORT="${SERVE_PORT:-8765}" SERVE_READY_TIMEOUT="${SERVE_READY_TIMEOUT:-900}" -# Number of dedicated serve replica nodes (multi-node only). Default 1. SERVE_NODES="${SERVE_NODES:-1}" -# All serve replicas share one scratch dir; per-request safetensors files are keyed -# by a unique vllm request id, so they don't collide across servers. 
+# Shared scratch; per-request safetensors keyed by vllm request id, so no collision. SERVE_SCRATCH="/scratchspace/streaming_serve_scratch" SERVE_LOG="/scratchspace/vllm_serve.log" # serve nodes override with a per-node path -# Rendezvous over the shared /scratchspace mount (visible on every node): each serve -# node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer signals -# completion via DONE_FILE; trainers collect all serve addresses. -# Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on -# the same shared mount don't read/write each other's addresses. SLURM_JOB_ID is -# identical across every node of one allocation and unique across allocations; falls -# back to a fixed token off-Slurm (single run). +# Namespace rendezvous/sentinel files per Slurm job (SLURM_JOB_ID: same across an +# allocation's nodes, unique across allocations) so concurrent allocations on the +# shared mount don't clobber each other's addresses. Fixed token off-Slurm. RUN_ID="${SLURM_JOB_ID:-local}" SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}" DONE_FILE="/scratchspace/.training_done.${RUN_ID}" @@ -164,10 +131,9 @@ cleanup() { gpus_on_node() { nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n1; } -# Resolve a *routable* IP for this node (other nodes must be able to dial it). -# `hostname -I` can list a link-local (169.254.x) or loopback address first, so -# prefer the resolved Slurm node name, then the first non-loopback/non-link-local IP. -# $1 = optional override (e.g. SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP) +# Resolve a routable IP (other nodes must dial it). `hostname -I` can list a +# link-local/loopback first, so prefer the Slurm node name, then first non-lo/non-ll IP. 
+# $1 = optional override (SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP) resolve_routable_ip() { local ip="$1" [ -z "$ip" ] && ip=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) @@ -181,25 +147,16 @@ resolve_routable_ip() { launch_vllm() { local bind_host="$1" tp="$2" cvd="$3" echo "Launching vllm serve on ${bind_host}:${SERVE_PORT} (TP=${tp}, CUDA_VISIBLE_DEVICES=${cvd:-all}, mem=${SERVE_GPU_MEM_UTIL}, log: $SERVE_LOG)..." - # Only pin GPUs when a non-empty set is given; an empty CUDA_VISIBLE_DEVICES - # would expose *zero* GPUs (not all), so leave it unset to use the whole node. + # Pin GPUs only for a non-empty set; empty CUDA_VISIBLE_DEVICES hides ALL, so unset = whole node. local -a gpu_env=() [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") - # Optional single-value memory knobs (see header), assembled into --flag - # value pairs. Each is a space-free env value so it survives nemo_run's - # unquoted `export FOO=value`. + # Optional memory knobs (see header). Space-free env values to survive nemo_run's unquoted export. local -a opt_args=() [ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB") [ -n "${SERVE_MAX_MODEL_LEN:-}" ] && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN") [ -n "${SERVE_MAX_NUM_SEQS:-}" ] && opt_args+=(--max-num-seqs "$SERVE_MAX_NUM_SEQS") - # --no-enable-chunked-prefill / --no-enable-prefix-caching: the - # ExampleHiddenStatesConnector captures hidden states during prefill; both - # features skip recomputing cached/partial prefixes, which yields short or - # empty hidden_states. Required, not optional. - # --no-enable-flashinfer-autotune: on big NVFP4 MoE (Kimi) the flashinfer - # autotuner re-tunes on the first real serving step and stalls a worker past - # vLLM's execute-model timeout, killing EngineCore and aborting the trainer. - # Required there; keeps kernels static. 
+ # --no-enable-chunked-prefill / --no-enable-prefix-caching: connector captures hidden states during prefill; both skip recomputing cached/partial prefixes, yielding short/empty hidden_states. Required. + # --no-enable-flashinfer-autotune: on NVFP4 MoE the autotuner re-tunes on the first serving step and stalls a worker past vLLM's execute-model timeout, killing EngineCore. "${gpu_env[@]}" vllm serve "$HF_MODEL_CKPT" \ --host "$bind_host" \ --port "$SERVE_PORT" \ @@ -245,21 +202,16 @@ wait_vllm_ready() { # Run the trainer then export the HF checkpoint. # $1 = streaming server base URL $2 = CUDA_VISIBLE_DEVICES ("" -> all) -# Fetch concurrency comes from the DataLoader's workers (each worker = one in-flight -# request). STREAMING_NUM_WORKERS sets that; keep it modest so (ranks-per-server x -# workers) stays near the server's max_num_seqs (flooding a cold NVFP4 MoE server -# kills EngineCore). 0 disables prefetch (serialized fetches) and is usually too slow. +# DataLoader workers = in-flight fetches per rank; keep modest so (ranks x workers) stays near the serve's max_num_seqs. run_trainer_and_export() { local url="$1" cvd="$2" - # Optional multi-node trainer routing (see dispatch section). Defaults: single - # trainer node, no --num_nodes, export on rank 0. + # Optional multi-node trainer routing (see dispatch). Defaults: 1 node, no --num_nodes, export on rank 0. local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}" echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..." - # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all). + # Empty cvd -> all GPUs (don't set the var; "" hides all). local -a gpu_env=() [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") - # Engage accelerate multi-node routing only when >1 trainer node; a single - # trainer node omits --num_nodes. + # accelerate multi-node routing only when >1 trainer node. 
local -a mn_args=() if [ "${num_tnodes}" -gt 1 ]; then mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank") @@ -273,19 +225,15 @@ run_trainer_and_export() { training.dataloader_num_workers="${STREAMING_NUM_WORKERS:-4}" \ || { echo "ERROR: trainer failed." >&2; return 1; } - # Export only on the head trainer (machine_rank 0); non-head trainer nodes - # would race writing the same export dir. The export reads the saved - # checkpoint (training.output_dir), not the serve, so it is serve-independent. + # Export only on the head trainer (machine_rank 0); non-head nodes would race the same export dir. Export reads training.output_dir, not the serve. if [ "${mrank}" -ne 0 ]; then echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)." return 0 fi - # Export the trained draft to HF format. Derive the checkpoint dir from the - # forwarded `training.output_dir=` dotlist (defaulting to the EAGLE - # convention) so EAGLE and DFlash runs each export their own output_dir. - # EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass - # --trust_remote_code; empty by default so EAGLE behavior is unchanged. + # Derive checkpoint dir from the forwarded training.output_dir= dotlist (EAGLE default) + # so EAGLE/DFlash runs each export their own dir. EXPORT_EXTRA_ARGS lets DFlash on a + # custom-modeling base (e.g. Kimi) pass --trust_remote_code; empty by default. local out_dir out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1) out_dir="${out_dir:-/scratchspace/eagle3}" @@ -295,16 +243,12 @@ run_trainer_and_export() { ${EXPORT_EXTRA_ARGS:-} } -# --------------------------------------------------------------------------- -# Topology dispatch (see header): nemo_run runs this script once per node, so -# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads. 
-# --------------------------------------------------------------------------- +# Topology dispatch (see header): branch on $SLURM_NNODES / $SLURM_NODEID. NNODES="${SLURM_NNODES:-1}" NODEID="${SLURM_NODEID:-0}" -# Multi-node needs at least one trainer node: with SERVE_NODES >= NNODES every node -# takes the serve branch, so no trainer ever publishes the rendezvous address or the -# DONE_FILE and the serve nodes block forever. Reject it up front. +# Need >=1 trainer node: with SERVE_NODES >= NNODES every node takes the serve branch, +# so nobody publishes the rendezvous/DONE_FILE and serve nodes block forever. if [ "$NNODES" -gt 1 ] && [ "$SERVE_NODES" -ge "$NNODES" ]; then echo "ERROR: SERVE_NODES ($SERVE_NODES) must be < SLURM_NNODES ($NNODES); need >=1 trainer node." >&2 exit 1 @@ -336,8 +280,7 @@ PY elif [ "$NODEID" -lt "$SERVE_NODES" ]; then # ---------------------- multi-node: serve node(s) ---------------------- - # Slurm nodes 0..SERVE_NODES-1 each run an independent vllm serve replica on - # their whole node and publish their address to ${SERVE_ADDR_FILE}.${NODEID}. + # Each runs a whole-node vllm serve replica and publishes ${SERVE_ADDR_FILE}.${NODEID}. SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}" # dedicated node -> use most of it SERVE_TP="${SERVE_TP:-$(gpus_on_node)}" # default: all GPUs on this node SERVE_LOG="/scratchspace/vllm_serve.${NODEID}.log" # per-node log (avoid collision) @@ -356,21 +299,19 @@ elif [ "$NODEID" -lt "$SERVE_NODES" ]; then else # -------------------- multi-node: trainer node(s) ---------------------- - # Serve nodes are 0..SERVE_NODES-1; trainer nodes are SERVE_NODES..NNODES-1, - # mapping to 0-based accelerate machine ranks (head trainer = first trainer node). + # Trainer nodes SERVE_NODES..NNODES-1 -> 0-based accelerate machine ranks. 
NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES )) TRAINER_RANK=$(( NODEID - SERVE_NODES )) TRAINER_ADDR_FILE="/scratchspace/.trainer_addr.${RUN_ID}" # per-job (see RUN_ID) - # Only the head trainer (rank 0) signals the serve nodes to release on exit; - # a non-head node exiting first must NOT tear the serves down early. + # Only head trainer (rank 0) signals serves to release on exit; a non-head node + # exiting first must NOT tear them down early. if [ "$TRAINER_RANK" -eq 0 ]; then trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT rm -f "$TRAINER_ADDR_FILE" # clear stale rendezvous state fi - # Collect every serve replica's address and build the comma-joined URL list the - # streaming dataset round-robins across (one fetch per worker, spread over serves). + # Collect serve addresses into the comma-joined URL list the dataset round-robins across. echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for ${SERVE_NODES} serve address(es)..." URLS="" for ((s = 0; s < SERVE_NODES; s++)); do @@ -387,11 +328,10 @@ else echo "Trainer rank ${TRAINER_RANK} using serve URLs: ${URLS}" if [ "$NUM_TRAINER_NODES" -le 1 ]; then - # 1 trainer node: single-node DDP (no accelerate multi-node routing). + # 1 trainer node: single-node DDP. run_trainer_and_export "$URLS" "" || exit 1 else - # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's - # rendezvous (port 29500); all trainer nodes read it and join. + # >1 trainer node: head publishes its routable IP for accelerate rendezvous (29500); all read and join. 
if [ "$TRAINER_RANK" -eq 0 ]; then head_addr=$(resolve_routable_ip "${TRAINER_ADVERTISE_IP:-}") echo "$head_addr" > "$TRAINER_ADDR_FILE" @@ -410,5 +350,3 @@ else fi ################################################################################################### - -#exit_handler $0 diff --git a/tools/launcher/core.py b/tools/launcher/core.py index dcdd86aad40..9154c1427bc 100644 --- a/tools/launcher/core.py +++ b/tools/launcher/core.py @@ -28,26 +28,6 @@ import nemo_run as run import yaml -__all__ = [ - "DEFAULT_EXPERIMENT_TITLE", - "GlobalVariables", - "SandboxPipeline", - "SandboxTask", - "SandboxTask0", - "SandboxTask1", - "SandboxTask2", - "SandboxTask3", - "SandboxTask4", - "build_docker_executor", - "build_slurm_executor", - "create_task_from_yaml", - "get_default_env", - "register_factory", - "report_versions", - "run_jobs", - "set_slurm_config_type", -] - # --------------------------------------------------------------------------- # Default environment variables injected into every job # --------------------------------------------------------------------------- @@ -277,8 +257,8 @@ def build_slurm_executor( # use a LocalTunnel: nemo_run then runs sbatch and copies artifacts via local # subprocess/shutil instead of ssh+rsync. This avoids flaky/hanging ssh-to- # localhost (e.g. MaxStartups throttling on a shared login node, or clusters - # like HSG that are only reachable through an sss proxy so paramiko can't - # tunnel in from outside). For real remote hosts, keep the SSHTunnel. + # only reachable through a login proxy so paramiko can't tunnel in from + # outside). For real remote hosts, keep the SSHTunnel. 
if slurm_config.host in ("localhost", "127.0.0.1"): tunnel = run.LocalTunnel(job_dir=job_dir) else: diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml index 91b8c54a8b5..d46e0eee68b 100644 --- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml +++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml @@ -2,14 +2,11 @@ # # 3-step pipeline: # task_0: Build input conversations (jsonl) -# task_1: Streaming train — vllm serve + trainer; hidden states are fetched -# per sample over HTTP (no on-disk dump) -# task_2: Benchmark — evaluate speculative decoding speedup via VLLM +# task_1: Streaming train — vllm serve + trainer; hidden states fetched over HTTP +# task_2: Benchmark — speculative decoding speedup via VLLM # -# task_1 here uses the multi-node split (nodes=2): node 0 runs vllm serve, node 1 -# runs the trainer; they rendezvous via the shared /scratchspace mount. (Set -# nodes=1 to co-locate both on one node instead.) All tasks share /scratchspace -# to pass artifacts between steps. +# task_1 uses nodes=2: node 0 runs vllm serve, node 1 the trainer. Tasks share +# /scratchspace to pass artifacts. # # Usage: # uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml --yes @@ -23,7 +20,6 @@ pipeline: global_vars: hf_model: /hf-local/Qwen/Qwen3-8B - # Step 1: Build input conversations task_0: script: common/eagle3/make_dataset.sh args: @@ -36,11 +32,8 @@ pipeline: gpus_per_node: 1 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 - # Step 2: Streaming EAGLE3 training - # - # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32]; - # vllm capture ids are those shifted by +1, plus the final layer: - # [2, 18, 33] + [36] = [2, 18, 33, 36]. + # capture ids = default_eagle_aux_layer_ids(36)=[1,17,32] shifted +1, plus final + # layer 36 -> [2,18,33,36]. 
task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -56,8 +49,7 @@ pipeline: - eagle.eagle_use_torch_compile=false environment: - HF_MODEL_CKPT: <> - # No spaces: nemo_run emits `export FOO=value` without quotes, so a - # space-separated value would be split by the shell. + # No spaces: nemo_run emits unquoted `export FOO=value`, so spaces would split. - EAGLE_CAPTURE_IDS: "[2,18,33,36]" - SERVE_TP: "1" slurm_config: @@ -67,7 +59,6 @@ pipeline: gpus_per_node: 1 container: vllm/vllm-openai:latest - # Step 3: Benchmark speculative decoding (VLLM backend) task_2: script: common/specdec_bench/quick_check.sh args: diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml index d0c99f6f0be..3751ecbe96a 100644 --- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml +++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml @@ -1,22 +1,7 @@ # EAGLE3 streaming speculative decoding pipeline for Qwen3-8B — MULTI-NODE. # -# Multi-node sibling of hf_streaming_eagle3.yaml. Both the serve and trainer sides -# scale across multiple nodes, with multiple GPUs per node. task_1 allocates 4 -# nodes x 2 GPUs: SERVE_NODES (=2) run independent vllm serve replicas (TP=2 each), -# the remaining 2 run multi-node-DDP trainers (2 GPUs each -> world_size=4). Tune -# the split via slurm_config.nodes/gpus_per_node + the SERVE_NODES env: -# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. -# -# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): -# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes -# K..N-1 are trainers. The head trainer publishes its IP for accelerate's -# rendezvous; every trainer reads all serve addresses and joins the DDP group. 
-# -# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler -# shards the corpus across ALL trainer ranks and each rank fetches ONLY its own -# shard, round-robin across the K serve replicas (data.streaming_server_url is the -# comma-joined list). Trainer nodes scale effective batch / compute; serve nodes -# scale data-production throughput (~K x). +# task_1 splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES; +# see common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding. # # 3-step pipeline: # task_0: Build input conversations (jsonl) @@ -49,12 +34,7 @@ pipeline: container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each). - # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); - # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. - # - # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32]; - # vllm capture ids are those shifted by +1, plus the final layer: - # [2, 18, 33] + [36] = [2, 18, 33, 36]. + # Capture ids: default_eagle_aux_layer_ids(36)=[1,17,32] +1, plus final layer 36. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -70,16 +50,12 @@ pipeline: - eagle.eagle_use_torch_compile=false environment: - HF_MODEL_CKPT: <> - # No spaces: nemo_run emits `export FOO=value` without quotes, so a - # space-separated value would be split by the shell. + # No spaces: nemo_run emits `export FOO=value` unquoted. - EAGLE_CAPTURE_IDS: "[2,18,33,36]" - # Each serve node has 2 GPUs -> TP=2. - SERVE_TP: "2" # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. - SERVE_NODES: "2" - # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES. - # Here 4 ranks x 4 / 2 serves = 8 concurrent per serve — fine for Qwen's - # max_num_seqs. 
+ # Per-rank in-flight fetches; per-serve concurrency = trainer ranks x STREAMING_NUM_WORKERS / SERVE_NODES (4 x 4 / 2 = 8 here), which must stay within the serve's max_num_seqs. - STREAMING_NUM_WORKERS: "4" slurm_config: _factory_: "slurm_factory" diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml index 47ef2950b95..5cb467b3f6a 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml @@ -1,33 +1,6 @@ -# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4). -# -# Single-task pipeline that exercises the full convert→save→export path WITHOUT -# actually training. Uses the same `common/specdec/dflash_online_training.sh` -# entrypoint as a real DFlash run; all dry-run behaviour is expressed as dotlist -# overrides on `main.py` (shared with EAGLE3 — `--dry_run` is mode-agnostic): -# -# --dry_run → main.py skips trainer.train(), saves -# the (untrained) ModelOpt checkpoint -# to training.output_dir right after -# mtsp.convert(model, [("dflash", ...)]) -# data.offline_data_path= → setting an offline path makes -# mode='offline' → use_offline_training -# =True. Combined with -# use_fake_base_for_offline=true this -# loads a FakeBaseModel (only -# embed_tokens + lm_head), so the ~1T -# MoE base fits on a single GPU. The -# file is never read in --dry_run mode. -# model.trust_remote_code=true → Kimi-K2.5 (deepseek_v3 arch) ships a -# custom modeling file -# dflash.dflash_mask_token_id=163838 → Kimi-K2.5 has no dedicated mask token; -# 163838 is a reserved slot used as the -# DFlash mask -# -# The dflash_online_training.sh export block then writes an HF-format DFlash draft -# to /scratchspace/dflash/exported-checkpoint-final with the correct architecture -# (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by -# design.
Useful for smoke-testing the launcher / convert / export plumbing and -# validating downstream loaders without a real training run. +# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4): exercises the full +# convert->save->export path WITHOUT training, to validate launcher/export +# plumbing and downstream loaders. Exported draft has untrained weights. # # Usage: # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes @@ -41,18 +14,22 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4/ - # Convert → save → export (no training). + # Convert -> save -> export (no training). task_0: script: common/specdec/dflash_online_training.sh args: + # Skips trainer.train(), saves the untrained checkpoint right after convert. - --dry_run - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml - model.model_name_or_path=<> + # In offline mode, loads a FakeBaseModel (embed_tokens + lm_head only) so the ~1T MoE base fits on one GPU. - model.use_fake_base_for_offline=true - model.trust_remote_code=true + # An offline path forces mode=offline; value unused in dry-run. - data.offline_data_path=/tmp/dryrun-placeholder - training.output_dir=/scratchspace/dflash - training.disable_tqdm=true + # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask. - dflash.dflash_mask_token_id=163838 slurm_config: _factory_: "slurm_factory" diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml index 62e3c742e65..50475b1a28e 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml @@ -1,39 +1,22 @@ # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on -# GB200/Blackwell (HSG).
Sibling of hf_streaming_eagle3.yaml — same vLLM-serve + -# trainer split, but trains a DFlash drafter instead of EAGLE3 by pointing the -# shared, algorithm-agnostic streaming script at the dflash recipe. +# GB200/Blackwell: node 0 = vllm serve (TP=4, whole node), node 1 = DFlash +# trainer. See common/eagle3/train_eagle_streaming.sh header for the mechanism. # -# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 -# model fit at TP=4 on ONE 4-GPU node with no cpu-offload. Topology: node 0 = -# vllm serve (TP=4, whole node), node 1 = DFlash trainer (fake base), 2 nodes. +# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi at TP=4 on one node. # -# How streaming feeds DFlash: data.mode=streaming sets dflash_offline=True -# (derived in modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), -# so the DFlash module consumes the streamed hidden states (base_model_outputs) -# instead of running the fake base. vLLM dumps captured layers as -# [seq, n_captured, hidden]; the dataset splits the LAST captured layer into -# base_model_hidden_states (DFlash self-logit distillation) and the REST into -# aux_hidden_states (DFlash's concatenated target-layer features). So n_captured -# must be (num DFlash target layers + 1). +# data.mode=streaming sets dflash_offline so the DFlash module consumes streamed +# hidden states instead of running the fake base. +# Capture ids = [2,16,31,45,59,60] (kimi_k25/deepseek_v3, 61 layers): 5 DFlash +# target layers + base 60. n_captured = num_target_layers + 1. # -# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; -# the true final layer is NOT capturable so we use 60 as the base): -# DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5) -# = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) = -# [2,16,31,45,59]. Append base 60. 
captured = [2,16,31,45,59,60] = 6, so the -# dataset yields 5 aux layers, matching the 5-layer DFlash draft block. +# answer_only_loss forced false: Kimi's chat template lacks {% generation %} tags +# needed to derive the assistant-token mask; flip on with a tagged template. # -# answer_only_loss: forced false here. DFlash's recipe default is true, which -# requires the tokenizer chat template to carry {% generation %} tags so the -# streaming dataset can derive an assistant-token mask; Kimi's template does not, -# and the streaming path does not inject data.chat_template. To train -# assistant-only later, supply a generation-tagged template and flip this on. -# -# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): -# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT= \ # SLURM_PARTITION=batch \ -# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ -# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# SLURM_HF_LOCAL= \ +# SLURM_JOB_DIR= \ # NEMORUN_HOME=$PWD # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes @@ -50,7 +33,6 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 - # Step 1: Build input conversations (model-agnostic) task_0: script: common/eagle3/make_dataset.sh args: @@ -60,13 +42,10 @@ pipeline: _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 1 - # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), - # so request 4 even though make_dataset is CPU-only. + # The cluster QOS requires whole-node GPU allocation though make_dataset is CPU-only. 
gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 - # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node. - # Reuses the shared streaming orchestrator common/eagle3/train_eagle_streaming.sh. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -77,52 +56,39 @@ pipeline: - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - training.output_dir=/scratchspace/dflash - # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. + # Must be divisible by dflash_block_size (8). - training.training_seq_len=4096 - training.disable_tqdm=true - training.num_train_epochs=1 - training.max_steps=3000 - # See header: Kimi's template lacks {% generation %} tags; train on all tokens. + # Kimi's template lacks {% generation %} tags; train on all tokens (see header). - training.answer_only_loss=false - # dflash.yaml sets report_to=tensorboard, but the vLLM container has no - # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable - # reporting (loss still prints to stdout via logging_steps). + # vLLM container has no tensorboard (dflash.yaml's default report_to); disable. - training.report_to=none - # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838 - # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run). + # Kimi-K2.5 has no dedicated mask token; 163838 is a reserved slot used as one. - dflash.dflash_mask_token_id=163838 environment: - HF_MODEL_CKPT: <> # No spaces in values: nemo_run emits `export FOO=value` unquoted. - # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" - SERVE_TP: "4" - # DataLoader workers per trainer rank = in-flight requests per rank. All - # trainer ranks fetch, so per-serve in-flight = trainer_world_size(4) x - # STREAMING_NUM_WORKERS. 
Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood - # stalls a cold NVFP4-MoE worker past vLLM's timeout and kills EngineCore. + # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore). - STREAMING_NUM_WORKERS: "1" # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" - # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context - # to the training seq len, leaving headroom for activation spikes. + # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4. - SERVE_MAX_MODEL_LEN: "4096" - # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # A worker can stall on the first real serving step (cold NVFP4 MoE kernels) - # past vLLM's default execute-model timeout, killing EngineCore. Extend the - # timeouts (seconds) that govern that path. + # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds). - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: _factory_: "slurm_factory" nodes: 2 - # Pin the serve node + trainer node into one NVL72 block. Inter-node here is - # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this - # is a latency/locality nicety rather than a correctness requirement. + # Pin nodes into one NVL72 block (latency/locality; inter-node is HTTP + lustre, not NCCL). 
segment: 2 ntasks_per_node: 1 gpus_per_node: 4 diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml index 703f636d53b..fedf729a038 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -1,59 +1,20 @@ # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on -# GB200/Blackwell (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of -# hf_streaming_dflash.yaml: BOTH sides scale out. SERVE_NODES (=2) run independent -# vllm serve replicas (TP=4 each, whole node); the remaining nodes run multi-node-DDP -# trainers (4 GPUs each). This file allocates 4 nodes = 2 serve + 2 trainer -# (world_size=8). Tune the split via slurm_config.nodes/segment + SERVE_NODES: -# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. +# GB200/Blackwell — MULTI-NODE: both serve and trainer sides scale out. +# nodes=N, SERVE_NODES=K -> K serve replicas (TP=4, whole node) + (N-K) trainer +# nodes. See common/eagle3/train_eagle_streaming.sh for dispatch/sharding/scaling. # -# Why GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 model fit -# at TP=4 on ONE 4-GPU node with no cpu-offload. So each serve replica owns a whole -# node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base). +# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi-K2.5-NVFP4 at TP=4 +# on one 4-GPU node, so each serve replica owns a whole node. # -# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): -# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes -# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d -# rendezvous; every trainer reads all K serve addresses and joins the DDP group. 
-# segment= pins all nodes into one NVL72 block so inter-node DDP traffic -# rides NVLink. +# Capture ids: build_target_layer_ids(num_orig=61, num_draft=5)=[1,15,30,44,58] +# -> +1 for embedding = [2,16,31,45,59], append base 60 (final layer uncapturable). +# 6 captured = 5 aux layers, matching the 5-layer DFlash draft block. # -# How it scales: HF Trainer's DistributedSampler shards the corpus across ALL -# trainer ranks and each rank fetches ONLY its own shard, round-robin across the K -# serve replicas (data.streaming_server_url is the comma-joined list the script -# assembles). Trainer nodes scale effective batch / compute and distribute the -# lustre reads; serve nodes scale data-production throughput (~K x). -# -# Concurrency: keep it low for cold NVFP4 MoE (see SERVE_MAX_NUM_SEQS below). -# Per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES -# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. -# -# How streaming feeds DFlash: data.mode=streaming derives dflash_offline=True -# (modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), so the -# DFlash module consumes the streamed hidden states (base_model_outputs) instead of -# running the fake base. vLLM dumps captured layers as [seq, n_captured, hidden]; -# the dataset splits the LAST captured layer into base_model_hidden_states (DFlash -# self-logit distillation) and the REST into aux_hidden_states (DFlash's -# concatenated target-layer features). So n_captured must be (num DFlash target -# layers + 1). -# -# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60; -# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3): -# DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5) -# = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) = -# [2,16,31,45,59]. Append base 60. 
captured = [2,16,31,45,59,60] = 6, so the -# dataset yields 5 aux layers, matching the 5-layer DFlash draft block. -# -# answer_only_loss: forced false here. DFlash's recipe default is true, which -# requires the tokenizer chat template to carry {% generation %} tags so the -# streaming dataset can derive an assistant-token mask; Kimi's template does not, -# and the streaming path (unlike online) does not inject data.chat_template. To -# train assistant-only later, supply a generation-tagged template and flip this on. -# -# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): -# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT= \ # SLURM_PARTITION=batch \ -# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ -# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# SLURM_HF_LOCAL= \ +# SLURM_JOB_DIR= \ # NEMORUN_HOME=$PWD # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes @@ -70,7 +31,7 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 - # Step 1: Build input conversations (model-agnostic) + # Build input conversations. task_0: script: common/eagle3/make_dataset.sh args: @@ -80,14 +41,11 @@ pipeline: _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 1 - # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), - # so request 4 even though make_dataset is CPU-only. + # The cluster QOS requires whole-node GPU alloc even though make_dataset is CPU-only. gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 - # Step 2: Streaming DFlash training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each). 
- # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); - # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. + # Streaming DFlash training: 2 serve replicas (TP=4) + 2 trainer nodes. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -98,51 +56,41 @@ pipeline: - data.mode=streaming - data.data_path=/scratchspace/data/train.jsonl - training.output_dir=/scratchspace/dflash - # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks. + # Must be divisible by dflash_block_size (8). - training.training_seq_len=4096 - training.disable_tqdm=true - training.ar_validate_steps=500000 - training.num_train_epochs=1 - training.max_steps=500 - # See header: Kimi's template lacks {% generation %} tags; train on all tokens. + # Kimi's template lacks {% generation %} tags, so train on all tokens. - training.answer_only_loss=false - # dflash.yaml sets report_to=tensorboard, but the vLLM container has no - # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable - # reporting (loss still prints to stdout via logging_steps). + # vLLM container has no tensorboard (dflash.yaml's default) -> init crash. - training.report_to=none - # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838 - # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run). + # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask. - dflash.dflash_mask_token_id=163838 environment: - HF_MODEL_CKPT: <> - # No spaces in values: nemo_run emits `export FOO=value` unquoted. - # DFlash target layers (vLLM-indexed) + base 60; see header for derivation. + # See header for derivation. - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]" - # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. - SERVE_NODES: "2" - # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi). 
- SERVE_TP: "4" - # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1: - # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS. + # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore). - STREAMING_NUM_WORKERS: "1" - # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export. + # Kimi's custom-modeling base needs --trust_remote_code at export. - EXPORT_EXTRA_ARGS: "--trust_remote_code" - # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context - # to the training seq len and leave headroom for activation spikes. + # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4. - SERVE_MAX_MODEL_LEN: "4096" - # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # A worker can stall on the first real serving step (cold NVFP4 MoE kernels) - # past vLLM's default execute-model timeout, killing EngineCore. Extend the - # timeouts that govern that path (seconds). + # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds). - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: _factory_: "slurm_factory" nodes: 4 + # Pin nodes into one NVL72 block (essential for cross-node trainer DDP). 
segment: 4 ntasks_per_node: 1 gpus_per_node: 4 diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml index 3f6cf23d1b7..5ace5e83847 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml @@ -1,19 +1,16 @@ -# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell (HSG). +# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell. # -# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE -# 4-GPU node with no cpu-offload (on H100 it needs offload and is too slow to be -# usable). Topology: node 0 = vllm serve (TP=4, whole node), node 1 = EAGLE3 trainer -# (fake base); 4 GPUs each, 2 nodes. +# Requires GB200: native NVFP4 + 192 GB/GPU fits the ~551 GB model at TP=4 on one node. +# node 0 = vllm serve (TP=4), node 1 = EAGLE3 trainer (fake base); 4 GPUs each. # -# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed -# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60. -# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base. +# Capture ids: deepseek_v3 arch, 61 layers, indexed by layer input (0..60); +# [2,30,58] aux + [60] base (final layer not capturable). 
# -# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): -# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT= \ # SLURM_PARTITION=batch \ -# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ -# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# SLURM_HF_LOCAL= \ +# SLURM_JOB_DIR= \ # NEMORUN_HOME=$PWD # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes @@ -37,8 +34,7 @@ pipeline: _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 1 - # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), - # so request 4 even though make_dataset is CPU-only. + # The cluster QOS requires whole-node GPU alloc (4) even though make_dataset is CPU-only. gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 @@ -61,38 +57,24 @@ pipeline: - eagle.eagle_use_torch_compile=false environment: - HF_MODEL_CKPT: <> - # No spaces in values: nemo_run emits `export FOO=value` unquoted. + # No spaces: nemo_run emits `export FOO=value` unquoted. - EAGLE_CAPTURE_IDS: "[2,30,58,60]" - SERVE_TP: "4" - # DataLoader workers per trainer rank = in-flight requests per rank. The - # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0), - # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS. - # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE - # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort. + # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore). - STREAMING_NUM_WORKERS: "1" - # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. 
The model's - # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with - # 183/184 GB used). Cap context to the training seq len and leave headroom - # for activation spikes during the profiling forward. + # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4. - SERVE_MAX_MODEL_LEN: "4096" - # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # The killer was "RPC call to sample_tokens timed out" — a worker stalls on - # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default - # execute-model timeout, so EngineCore dies. Extend the timeouts that govern - # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. + # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds). - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: _factory_: "slurm_factory" nodes: 2 - # Pin the serve node + trainer node into one NVL72 block. Inter-node here is - # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this - # is a latency/locality nicety rather than a correctness requirement (cf. the - # multi_node examples, where cross-node trainer DDP makes segment essential). + # Pin nodes into one NVL72 block (latency nicety here; essential when trainers do cross-node DDP). 
segment: 2 ntasks_per_node: 1 gpus_per_node: 4 diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml index bddf6b06909..e57c78f3cc1 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml @@ -1,43 +1,17 @@ # EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell -# (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of hf_streaming_eagle3.yaml: -# BOTH sides scale out. SERVE_NODES (=2) run independent vllm serve replicas (TP=4 -# each, whole node); the remaining nodes run multi-node-DDP trainers (4 GPUs each). -# This file allocates 4 nodes = 2 serve + 2 trainer (world_size=8). Tune the split -# via slurm_config.nodes/segment + SERVE_NODES: -# nodes=N, SERVE_NODES=K -> K serve replicas + (N-K) trainer nodes. +# MULTI-NODE: K serve replicas (TP=4, whole node) + (N-K) DDP trainer nodes. +# This file: nodes=4, SERVE_NODES=2 -> 2 serve + 2 trainer. See dispatch/scaling in +# common/eagle3/train_eagle_streaming.sh header. # -# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE -# 4-GPU node with no cpu-offload. Each serve replica owns a whole node at TP=4; each -# trainer node uses all 4 GPUs for the draft (fake base). +# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi at TP=4 on one node. +# Capture ids = [2,30,58] aux + [60] base = 4 (kimi_k25/deepseek_v3, 61 layers; +# layer 60 is the last capturable, used as base). # -# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch): -# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes -# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d -# rendezvous; every trainer reads all K serve addresses and joins the DDP group. 
-# segment= pins all nodes into one NVL72 block so inter-node DDP traffic -# rides NVLink. -# -# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler shards -# the corpus across ALL trainer ranks and each rank fetches ONLY its own shard, -# round-robin across the K serve replicas (data.streaming_server_url is the -# comma-joined list the script assembles). So trainer nodes scale effective batch / -# compute and distribute the lustre reads; serve nodes scale data-production -# throughput (~K x), lifting the single-serve ceiling. -# -# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below): -# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES -# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server -# stalls a worker past vLLM's execute-model timeout and kills EngineCore. -# -# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed -# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60. -# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base. 
-# -# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): -# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT= \ # SLURM_PARTITION=batch \ -# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ -# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# SLURM_HF_LOCAL= \ +# SLURM_JOB_DIR= \ # NEMORUN_HOME=$PWD # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes @@ -51,7 +25,6 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 - # Step 1: Build input conversations (model-agnostic) task_0: script: common/eagle3/make_dataset.sh args: @@ -61,14 +34,10 @@ pipeline: _factory_: "slurm_factory" nodes: 1 ntasks_per_node: 1 - # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200), - # so request 4 even though make_dataset is CPU-only. + # The cluster QOS requires whole-node GPU allocation though make_dataset is CPU-only. gpus_per_node: 4 container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 - # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each). - # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh); - # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology. task_1: script: common/eagle3/train_eagle_streaming.sh args: @@ -89,40 +58,28 @@ pipeline: - HF_MODEL_CKPT: <> # No spaces in values: nemo_run emits `export FOO=value` unquoted. - EAGLE_CAPTURE_IDS: "[2,30,58,60]" - # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers. - SERVE_NODES: "2" - # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi). 
- SERVE_TP: "4" - # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1: - # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS. - # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker - # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort. + # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore). - STREAMING_NUM_WORKERS: "1" - # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's - # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with - # 183/184 GB used). Cap context to the training seq len and leave headroom - # for activation spikes during the profiling forward. + # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4. - SERVE_MAX_MODEL_LEN: "4096" - # Small batches: smaller per-step MoE compute stays under the engine timeout. - SERVE_MAX_NUM_SEQS: "4" - SERVE_GPU_MEM_UTIL: "0.8" - SERVE_READY_TIMEOUT: "2400" - SERVE_EXTRA_ARGS: "--trust-remote-code" - # The killer was "RPC call to sample_tokens timed out" — a worker stalls on - # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default - # execute-model timeout, so EngineCore dies. Extend the timeouts that govern - # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help. + # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds). - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200" - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200" slurm_config: _factory_: "slurm_factory" nodes: 4 + # Pin nodes into one NVL72 block (essential for cross-node trainer DDP). 
segment: 4 ntasks_per_node: 1 gpus_per_node: 4 container: vllm/vllm-openai:latest - # Step 3: Benchmark speculative decoding (VLLM backend, Kimi served at TP=4) task_2: script: common/specdec_bench/quick_check.sh args: @@ -135,7 +92,6 @@ pipeline: - --speculative_algorithm EAGLE3 - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl - --concurrency 32 - # Kimi has custom modeling code; bench run.py loads base+tokenizer and needs this. - --trust_remote_code environment: - HF_MODEL_CKPT: <> diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml index b2eea8c1ec6..7c37015d90d 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml @@ -1,44 +1,26 @@ -# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM. +# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM (in-process +# AsyncLLM, TP=4 + EP), benchmarking on MT-Bench. Outputs to /scratchspace/specdec_bench/. # -# Serves Kimi-K2.5-NVFP4 in-process (no HTTP server — specdec_bench drives an -# AsyncLLM) at TP=4 with expert parallelism, attaches a trained/exported DFLASH -# draft, and benchmarks speculative decoding on MT-Bench. Writes timing.json + -# aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl -# to /scratchspace/specdec_bench/. +# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi-K2.5-NVFP4 at TP=4 +# on one 4-GPU node. # -# Requires GB200/Blackwell (HSG): Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the -# 192 GB/GPU of GB200 to fit at TP=4 on ONE 4-GPU node with no cpu-offload. +# DFLASH: draft tokens default to 8 (=block_size); --draft_length does NOT apply. +# To override sampling/engine args, add `- --runtime_params ` (see +# examples/specdec_bench/README.md). 
# -# DFLASH specifics: -# - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8); -# --draft_length does NOT apply to DFLASH. To override sampling / engine args -# (e.g. speculative_num_draft_tokens, temperature), write a runtime-params -# yaml and add `- --runtime_params ` below — see -# examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern). -# - --draft_model_dir must point at a trained+exported HF-format DFLASH draft -# (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Set it via the -# `draft_model_dir` global_var below, or override on the CLI: -# pipeline.global_vars.draft_model_dir=/hf-local/ -# - Kimi needs --trust_remote_code for both tokenizer and model. +# NOTE on dataset: MT-Bench needs no data-prep. For SPEED-Bench instead, first run +# `prepare_data.py --dataset speed --config all`, then replace --mtbench with +# `--dataset speed` + `--dataset_path .../data/speed/`. # -# NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so -# it runs without any data-prep step. To benchmark on SPEED-Bench instead, first -# generate + stage a split: -# python3 examples/specdec_bench/prepare_data.py --dataset speed --config all -# (splits: qualitative, throughput_1k, throughput_16k, ...) then swap the -# `--mtbench` arg for: -# - --dataset speed -# - --dataset_path modules/Model-Optimizer/examples/specdec_bench/data/speed/throughput_16k +# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH; on +# GB200/aarch64 use an aarch64 DFLASH-capable image (e.g. a 0511 nightly), via +# pipeline.task_0.slurm_config.container=. UNRESOLVED. # -# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH support; -# on GB200/aarch64 use an aarch64 vLLM image new enough for DFLASH (validated on -# a 0511 nightly). 
Override with: pipeline.task_0.slurm_config.container= -# -# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy): -# export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \ +# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): +# export SLURM_HOST=localhost SLURM_ACCOUNT= \ # SLURM_PARTITION=batch \ -# SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \ -# SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \ +# SLURM_HF_LOCAL= \ +# SLURM_JOB_DIR= \ # NEMORUN_HOME=$PWD # uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/specdec_bench.yaml \ # identity=$HOME/.ssh/id_ecdsa detach=True --yes @@ -52,8 +34,7 @@ pipeline: global_vars: hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4 - # Trained + exported HF-format DFLASH draft checkpoint. Defaults to the standard - # export path; override on the CLI with: pipeline.global_vars.draft_model_dir= + # Trained+exported DFLASH draft; override: pipeline.global_vars.draft_model_dir= draft_model_dir: /hf-local/nvidia/Kimi-K2.5-DFlash task_0: From 9760dec8eee9e568d415524a94d803839eda377e Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Thu, 4 Jun 2026 23:54:27 +0000 Subject: [PATCH 13/14] address comments Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- .../speculative/plugins/hf_streaming_dataset.py | 13 +++++++++++-- .../launcher/common/eagle3/train_eagle_streaming.sh | 7 ++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py index 65b2cd4f0d7..1dd38b64154 100644 --- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py +++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py @@ -374,7 +374,9 @@ def _client(self) -> httpx.Client: DataLoader workers are forked 
processes; httpx connection pools must not be shared across a fork, so each process gets its own client (and its own - round-robin cursor over ``server_urls``), keyed by PID. + round-robin cursor over ``server_urls``), keyed by PID. The cursor starts + at a per-(rank, worker) offset so cold-start fetches fan out across + replicas instead of all hitting ``server_urls[0]``. """ pid = os.getpid() if getattr(self, "_client_pid", None) != pid: @@ -382,7 +384,14 @@ def _client(self) -> httpx.Client: timeout=httpx.Timeout(self.config.request_timeout, connect=10.0) ) self._client_pid = pid - self._rr = 0 + # Stagger the initial cursor by (rank, worker) so cold-start fetches + # fan out instead of all pinning server_urls[0] (which can flood one + # cold replica past its execute-model timeout and kill the EngineCore). + info = torch.utils.data.get_worker_info() + worker_id = info.id if info is not None else 0 + num_workers = info.num_workers if info is not None else 1 + rank = int(os.environ.get("RANK", "0")) + self._rr = rank * num_workers + worker_id return self._http def _next_url(self) -> str: diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh index 6f9636c459a..49b54709d35 100755 --- a/tools/launcher/common/eagle3/train_eagle_streaming.sh +++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh @@ -236,7 +236,12 @@ run_trainer_and_export() { # custom-modeling base (e.g. Kimi) pass --trust_remote_code; empty by default. local out_dir out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1) - out_dir="${out_dir:-/scratchspace/eagle3}" + # Fail loud rather than guess a default: a wrong dir would silently export the + # wrong checkpoint. Every streaming yaml already forwards training.output_dir=. + if [ -z "$out_dir" ]; then + echo "ERROR: no training.output_dir= forwarded in SCRIPT_ARGS; cannot locate checkpoint to export." 
>&2 + return 1 + fi python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \ --model_path "$out_dir" \ --export_path "${EXPORT_PATH:-/scratchspace/export}" \ From 4c33ef57980170ea11dc39bce5e1b46f89da9128 Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Fri, 5 Jun 2026 00:38:54 +0000 Subject: [PATCH 14/14] kimi answer only Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- examples/speculative_decoding/eagle_utils.py | 4 +- modelopt/torch/speculative/eagle/utils.py | 26 +++- .../plugins/hf_streaming_dataset.py | 27 ++-- modelopt/torch/utils/__init__.py | 1 + modelopt/torch/utils/loss_mask.py | 139 ++++++++++++++++++ .../Kimi-K2.5/hf_streaming_dflash.yaml | 10 +- .../hf_streaming_dflash_multi_node.yaml | 5 +- 7 files changed, 192 insertions(+), 20 deletions(-) create mode 100644 modelopt/torch/utils/loss_mask.py diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py index f3ef93d740e..626ea786237 100644 --- a/examples/speculative_decoding/eagle_utils.py +++ b/examples/speculative_decoding/eagle_utils.py @@ -139,7 +139,9 @@ def make_speculative_data_module( raise ValueError("sample_size must be -1 (use all samples) or a positive integer") if data_args.sample_size > 0: dumped_files = dumped_files[: data_args.sample_size] - train_dataset = OfflineSupervisedDataset(dumped_files, answer_only_loss=answer_only_loss) + train_dataset = OfflineSupervisedDataset( + dumped_files, answer_only_loss=answer_only_loss, tokenizer=tokenizer + ) data_collator = EagleOfflineDataCollator(train_len=train_len) return { diff --git a/modelopt/torch/speculative/eagle/utils.py b/modelopt/torch/speculative/eagle/utils.py index f74fcb1e9fb..2c536d04991 100644 --- a/modelopt/torch/speculative/eagle/utils.py +++ b/modelopt/torch/speculative/eagle/utils.py @@ -41,6 +41,8 @@ from torch.utils.data import Dataset from transformers.trainer_pt_utils import 
LabelSmoother +from modelopt.torch.utils.loss_mask import get_loss_mask_recovery + IGNORE_TOKEN_ID = LabelSmoother.ignore_index @@ -96,20 +98,27 @@ class OfflineSupervisedDataset(Dataset): dumped_files (list): A list of file paths to the dumped .pt files. answer_only_loss (bool): If True, use the ``loss_mask`` stored in each .pt file so that only assistant-produced tokens contribute to the loss. - Raises ``ValueError`` on ``__getitem__`` if the file lacks ``loss_mask``. + If a file lacks ``loss_mask`` and ``tokenizer`` has a registered + model-specific recovery (see ``modelopt.torch.utils.loss_mask``), the + mask is rebuilt from ``input_ids``; otherwise ``__getitem__`` raises + ``ValueError``. If False (default), a uniform all-ones mask is used regardless of what is stored in the file (backward compatible). + tokenizer: Optional tokenizer used to recover the assistant mask for dumps + that lack a stored ``loss_mask``. """ def __init__( self, dumped_files, answer_only_loss: bool = False, + tokenizer=None, ): """Initialize with a list of .pt file paths.""" super().__init__() self.dumped_files = dumped_files self.answer_only_loss = answer_only_loss + self.tokenizer = tokenizer def __len__(self): return len(self.dumped_files) @@ -121,13 +130,22 @@ def __getitem__(self, i) -> dict[str, torch.Tensor]: labels[..., :-1] = offline_data["input_ids"][..., 1:] if self.answer_only_loss: - if "loss_mask" not in offline_data: + recovery = get_loss_mask_recovery(self.tokenizer) if self.tokenizer else None + if "loss_mask" in offline_data: + loss_mask = offline_data["loss_mask"].to(offline_data["input_ids"].dtype) + elif recovery is not None: + # Dumps from tokenizers that cannot emit assistant masks carry no + # loss_mask; rebuild it from the token ids. 
+ loss_mask = recovery.compute(self.tokenizer, offline_data["input_ids"]).to( + offline_data["input_ids"].dtype + ) + else: raise ValueError( f"answer_only_loss=True requires a 'loss_mask' entry in the offline " f".pt file, but {self.dumped_files[i]} does not have one. Re-dump " - f"with --answer-only-loss in compute_hidden_states_*.py." + f"with --answer-only-loss in compute_hidden_states_*.py, or pass a " + f"tokenizer with a registered loss-mask recovery." ) - loss_mask = offline_data["loss_mask"].to(offline_data["input_ids"].dtype) else: loss_mask = torch.ones_like(offline_data["input_ids"]) diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py index 1dd38b64154..c1be45e9e56 100644 --- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py +++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py @@ -61,6 +61,7 @@ from transformers.trainer_pt_utils import LabelSmoother from modelopt.torch.utils import print_rank_0, warn_rank_0 +from modelopt.torch.utils.loss_mask import get_loss_mask_recovery __all__ = [ "EagleFetchPayload", @@ -100,31 +101,39 @@ def _tokenize_with_loss_mask( tags so the tokenizer can return ``assistant_masks``. When ``max_seq_len`` is set, truncation is delegated to the tokenizer so ids and assistant_masks are truncated in lockstep. + + ``assistant_masks`` requires a fast tokenizer (it needs ``char_to_token``). For + tokenizers without it, the mask is rebuilt from token ids via a registered + model-specific recovery (see ``modelopt.torch.utils.loss_mask``) if one matches. 
""" + recovery = None + if answer_only_loss and not getattr(tokenizer, "is_fast", False): + recovery = get_loss_mask_recovery(tokenizer) out = tokenizer.apply_chat_template( conversations, tokenize=True, return_tensors="pt", return_dict=True, - return_assistant_tokens_mask=answer_only_loss, + return_assistant_tokens_mask=answer_only_loss and recovery is None, add_generation_prompt=False, truncation=max_seq_len is not None, max_length=max_seq_len, ) input_ids = out["input_ids"] seq_len = input_ids.shape[-1] - if answer_only_loss: + if not answer_only_loss: + loss_mask = torch.ones(seq_len, dtype=torch.long) + elif recovery is not None: + loss_mask = recovery.compute(tokenizer, input_ids[0]) + else: mask = out["assistant_masks"] if not isinstance(mask, torch.Tensor): mask = torch.tensor(mask, dtype=torch.long) loss_mask = mask.squeeze(0).to(torch.long) - if loss_mask.shape[0] != seq_len: - raise RuntimeError( - f"assistant_masks length {loss_mask.shape[0]} does not match " - f"input_ids length {seq_len}" - ) - else: - loss_mask = torch.ones(seq_len, dtype=torch.long) + if loss_mask.shape[0] != seq_len: + raise RuntimeError( + f"loss_mask length {loss_mask.shape[0]} does not match input_ids length {seq_len}" + ) return input_ids, loss_mask diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py index 51d02248c14..a38c80cac01 100644 --- a/modelopt/torch/utils/__init__.py +++ b/modelopt/torch/utils/__init__.py @@ -22,6 +22,7 @@ from .import_utils import * from .list import * from .logging import * +from .loss_mask import * from .network import * from .perf import * from .regex import * diff --git a/modelopt/torch/utils/loss_mask.py b/modelopt/torch/utils/loss_mask.py new file mode 100644 index 00000000000..839bce24b8e --- /dev/null +++ b/modelopt/torch/utils/loss_mask.py @@ -0,0 +1,139 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Model-specific recovery of the assistant loss mask.
+
+The standard way to build an answer-only loss mask is
+``apply_chat_template(..., return_assistant_tokens_mask=True)``, which maps the
+``{% generation %}`` template span to tokens via ``char_to_token`` -- and that is
+only available on "fast" tokenizers. Some models ship only a slow/Python tokenizer
+and cannot use this path.
+
+This module is a small registry of per-model fallbacks that recover the mask
+directly from token ids, keyed by a ``detect`` predicate. Data paths consult
+:func:`get_loss_mask_recovery` and stay free of any single model's chat-format
+details. It is intentionally minimal and is meant to seed a broader model-specific
+patch registry.
+"""
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import torch
+
+__all__ = ["LossMaskRecovery", "get_loss_mask_recovery", "register_loss_mask_recovery"]
+
+
+@dataclass(frozen=True)
+class LossMaskRecovery:
+    """A model-specific fallback for building the assistant loss mask.
+
+    Args:
+        name: Identifier for the target model family (for logging/debugging).
+        detect: Returns ``True`` if this recovery applies to the given tokenizer.
+        compute: Maps ``(tokenizer, input_ids)`` to a ``(seq_len,)`` ``LongTensor``
+            mask aligned to ``input_ids`` (1 on tokens that should contribute to
+            the loss, 0 otherwise).
+    """
+
+    name: str
+    detect: Callable[[object], bool]
+    compute: Callable[[object, torch.Tensor], torch.Tensor]
+
+
+_RECOVERIES: list[LossMaskRecovery] = []
+
+
+def register_loss_mask_recovery(recovery: LossMaskRecovery) -> None:
+    """Register a model-specific loss-mask recovery (earlier registrations win on lookup)."""
+    _RECOVERIES.append(recovery)
+
+
+def get_loss_mask_recovery(tokenizer) -> LossMaskRecovery | None:
+    """Return the first registered recovery whose ``detect`` matches ``tokenizer``, else ``None``."""
+    return next((r for r in _RECOVERIES if r.detect(tokenizer)), None)
+
+
+# ---------------------------------------------------------------------------
+# Kimi
+#
+# Kimi ships only a Python (tiktoken) tokenizer, so it cannot emit assistant masks
+# via apply_chat_template. Its chat turns are rendered as
+#     <|im_{role}|> {role_name} <|im_middle|> {content} <|im_end|>
+# so the assistant content sits between <|im_middle|> and <|im_end|>.
+# ---------------------------------------------------------------------------
+
+_KIMI_ROLE_MARKERS = ("<|im_user|>", "<|im_assistant|>", "<|im_system|>")
+
+
+def _kimi_detect(tokenizer) -> bool:
+    """Whether ``tokenizer`` defines Kimi's chat markers as real, distinct tokens.
+
+    Returns ``False`` when a lookup raises or yields ``None``/``unk_token_id``, or when
+    two markers share an id (unknown tokens folded onto one undeclared fallback id).
+    """
+    markers = (*_KIMI_ROLE_MARKERS, "<|im_middle|>", "<|im_end|>")
+    unk = getattr(tokenizer, "unk_token_id", None)
+    try:
+        ids = [tokenizer.convert_tokens_to_ids(t) for t in markers]
+    except Exception:
+        return False
+    return len(set(ids)) == len(ids) and all(i is not None and i != unk for i in ids)
+
+
+def _kimi_compute(tokenizer, input_ids) -> torch.Tensor:
+    """Recover the assistant-content mask from already-tokenized Kimi chat ids.
+
+    Marks only the ``{content}`` span (between ``<|im_middle|>`` and ``<|im_end|>``,
+    both exclusive). This matches the ``{% generation %}`` span used for fast
+    tokenizers: the role header and the trailing ``<|im_end|>`` are not masked.
+
+    ``input_ids`` is one sequence of token ids (anything with ``tolist``, or an iterable);
+    the result is a ``torch.long`` tensor of the same length (1 = assistant content).
+    """
+    ids = input_ids.tolist() if hasattr(input_ids, "tolist") else list(input_ids)
+    assistant_id = tokenizer.convert_tokens_to_ids("<|im_assistant|>")
+    middle_id = tokenizer.convert_tokens_to_ids("<|im_middle|>")
+    end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    role_ids = {tokenizer.convert_tokens_to_ids(t) for t in _KIMI_ROLE_MARKERS}
+
+    n = len(ids)
+    mask = [0] * n
+    i = 0
+    while i < n:
+        if ids[i] != assistant_id:
+            i += 1
+            continue
+        # Skip the role header (role_name) up to its <|im_middle|> separator.
+        j = i + 1
+        while j < n and ids[j] != middle_id and ids[j] not in role_ids and ids[j] != end_id:
+            j += 1
+        if j >= n or ids[j] != middle_id:
+            # Malformed turn (no content separator) or a trailing generation prompt.
+            i = j
+            continue
+        # Mark the content span [middle + 1, end): excludes <|im_middle|> and <|im_end|>.
+        k = start = j + 1
+        while k < n and ids[k] != end_id and ids[k] not in role_ids:
+            k += 1
+        mask[start:k] = [1] * (k - start)
+        i = k
+
+    return torch.tensor(mask, dtype=torch.long)
+
+
+register_loss_mask_recovery(
+    LossMaskRecovery(name="kimi", detect=_kimi_detect, compute=_kimi_compute)
+)
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
index 50475b1a28e..8f82b1919b1 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -9,8 +9,10 @@
 # Capture ids = [2,16,31,45,59,60] (kimi_k25/deepseek_v3, 61 layers): 5 DFlash
 # target layers + base 60. n_captured = num_target_layers + 1.
 #
-# answer_only_loss forced false: Kimi's chat template lacks {% generation %} tags
-# needed to derive the assistant-token mask; flip on with a tagged template.
+# answer_only_loss=true: Kimi ships only a slow tokenizer, so it can't derive the
+# answer_only_loss=true: Kimi ships only a slow tokenizer, so it can't derive the +# assistant mask the standard way (return_assistant_tokens_mask needs a fast +# tokenizer's char_to_token). The mask is instead recovered from token ids by the +# registered model-specific recovery in modelopt.torch.utils.loss_mask. # # Run ON the cluster login node (paramiko can't reach the cluster through its login proxy): # export SLURM_HOST=localhost SLURM_ACCOUNT= \ @@ -61,8 +63,8 @@ pipeline: - training.disable_tqdm=true - training.num_train_epochs=1 - training.max_steps=3000 - # Kimi's template lacks {% generation %} tags; train on all tokens (see header). - - training.answer_only_loss=false + # Assistant mask recovered from token ids for Kimi's slow tokenizer (see header). + - training.answer_only_loss=true # vLLM container has no tensorboard (dflash.yaml's default report_to); disable. - training.report_to=none # Kimi-K2.5 has no dedicated mask token; 163838 is a reserved slot used as one. diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml index fedf729a038..6b70e94d262 100644 --- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml +++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml @@ -62,8 +62,9 @@ pipeline: - training.ar_validate_steps=500000 - training.num_train_epochs=1 - training.max_steps=500 - # Kimi's template lacks {% generation %} tags, so train on all tokens. - - training.answer_only_loss=false + # Kimi's slow tokenizer can't emit assistant masks the standard way; the mask + # is recovered from token ids (modelopt.torch.utils.loss_mask). + - training.answer_only_loss=true # vLLM container has no tensorboard (dflash.yaml's default) -> init crash. - training.report_to=none # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask.