From 5e696514411572ed6cffd659c93d4d799a74da02 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Tue, 2 Jun 2026 22:44:21 +0000
Subject: [PATCH 01/14] multinode streaming; k2.5 example

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/specdec_bench/specdec_bench/utils.py |   4 +
 examples/speculative_decoding/launch_train.sh |   8 +-
 modelopt/recipe/config.py                     |   6 +-
 modelopt/torch/speculative/config.py          |   6 +-
 .../common/eagle3/train_eagle_streaming.sh    | 124 +++++++++++-----
 tools/launcher/core.py                        |   3 +
 .../Kimi-K2.5/hf_dflash_dryrun.yaml           |  64 +++++++++
 .../Kimi-K2.5/hf_streaming_dflash.yaml        | 131 +++++++++++++++++
 .../hf_streaming_dflash_multi_node.yaml       | 133 ++++++++++++++++++
 .../moonshotai/Kimi-K2.5/specdec_bench.yaml   |  81 +++++++++++
 tools/launcher/slurm_config.py                |   7 +
 11 files changed, 530 insertions(+), 37 deletions(-)
 create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
 create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
 create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
 create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml

diff --git a/examples/specdec_bench/specdec_bench/utils.py b/examples/specdec_bench/specdec_bench/utils.py
index 9a52d0ceac2..73d1e048c80 100644
--- a/examples/specdec_bench/specdec_bench/utils.py
+++ b/examples/specdec_bench/specdec_bench/utils.py
@@ -196,6 +196,10 @@ def _checkpoint_provenance(model_dir):
 
 
 def _is_sensitive_key(key):
+    # Engine configs can carry non-string dict keys (e.g. int layer ids in a
+    # serving_config); those are never sensitive field *names*, so skip them.
+    if not isinstance(key, str):
+        return False
     klow = key.lower()
     if klow in _SENSITIVE_KEY_ALLOWLIST:
         return False
diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
index 41d71d14173..fc623930767 100755
--- a/examples/speculative_decoding/launch_train.sh
+++ b/examples/speculative_decoding/launch_train.sh
@@ -30,12 +30,14 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 CONFIG_FILE=""
 NUM_NODES=1
 HEAD_NODE_IP=""
+MACHINE_RANK=""
 EXTRA_ARGS=()
 while [ $# -gt 0 ]; do
   case "$1" in
     --config*)     if [[ "$1" != *=* ]]; then shift; fi; CONFIG_FILE="${1#*=}" ;;
     --num_nodes*)  if [[ "$1" != *=* ]]; then shift; fi; NUM_NODES="${1#*=}" ;;
     --head_node_ip*) if [[ "$1" != *=* ]]; then shift; fi; HEAD_NODE_IP="${1#*=}" ;;
+    --machine_rank*) if [[ "$1" != *=* ]]; then shift; fi; MACHINE_RANK="${1#*=}" ;;
     *) EXTRA_ARGS+=("$1") ;;
   esac
   shift
@@ -59,9 +61,13 @@ fi
 # Multi-node routing args (accelerate only; training config comes from the YAML)
 MULTI_NODE_ARGS=""
 if [[ "$NUM_NODES" != "1" ]]; then
+  # machine_rank: caller may pass --machine_rank explicitly (needed when the
+  # SLURM allocation reserves node 0 for something else, e.g. the streaming
+  # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank).
+  # Default to $SLURM_PROCID for the all-nodes-are-trainers case.
   MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \
                    --num_machines $NUM_NODES \
-                   --machine_rank $SLURM_PROCID \
+                   --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \
                    --rdzv_backend c10d \
                    --main_process_ip $HEAD_NODE_IP \
                    --main_process_port 29500"
diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py
index 4bf91b52d6f..97d93bbafc6 100644
--- a/modelopt/recipe/config.py
+++ b/modelopt/recipe/config.py
@@ -178,7 +178,11 @@ class ModelOptDFlashRecipe(ModelOptSpeculativeRecipeBase):
 
     @model_validator(mode="after")
     def _derive_dflash_offline(self) -> ModelOptDFlashRecipe:
-        self.dflash.dflash_offline = self.data.offline_data_path is not None
+        # offline (dumped .pt) and streaming (hidden states over HTTP from a vLLM
+        # serve) both feed pre-computed base hidden states to the DFlash module, so
+        # both set dflash_offline. Only fully-online training runs the base model.
+        # Mirrors ModelOptEagleRecipe._derive_eagle_offline.
+        self.dflash.dflash_offline = self.data.mode != "online"
         return self
 
 
diff --git a/modelopt/torch/speculative/config.py b/modelopt/torch/speculative/config.py
index 6b2c9396ce7..23ad200b6e7 100644
--- a/modelopt/torch/speculative/config.py
+++ b/modelopt/torch/speculative/config.py
@@ -68,8 +68,10 @@ class DFlashConfig(ModeloptBaseConfig):
     dflash_offline: bool = ModeloptField(
         default=False,
         description=(
-            "Whether to use detached DFlash (offline training from pre-computed hidden states). "
-            "Derived by ModelOptDFlashRecipe from data.offline_data_path; not user-configurable."
+            "Whether the DFlash module consumes pre-computed hidden states (offline from "
+            "dumped .pt files, or streaming over HTTP from a vLLM serve) instead of running "
+            "the base model. Derived by ModelOptDFlashRecipe from data.mode (True unless "
+            "online); not user-configurable."
         ),
     )
 
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index 158bd7a0cf6..4a8dc8bbacf 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -24,12 +24,19 @@
 # $SLURM_NODEID:
 #   nodes == 1  -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of
 #                  the local GPUs (original single-node behavior).
-#   nodes >= 2  -> split across nodes: node 0 runs vllm serve on all its GPUs,
-#                  node 1 runs the trainer on all its GPUs. The two roles
-#                  rendezvous through the shared /scratchspace mount (node 0
-#                  publishes its address; node 1 signals completion). For large
-#                  models whose serve needs a whole node (e.g. Kimi-K2.5 TP=8),
-#                  allocate exactly 2 nodes.
+#   nodes == 2  -> split: node 0 runs vllm serve on all its GPUs, node 1 runs
+#                  the trainer on all its GPUs. Roles rendezvous through the
+#                  shared /scratchspace mount (node 0 publishes its serve
+#                  address; the trainer signals completion).
+#   nodes >= 3  -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1)
+#                  doing multi-node DDP. The head trainer (node 1, accelerate
+#                  machine_rank 0) publishes its IP for accelerate's c10d
+#                  rendezvous; all trainer nodes read both the serve address and
+#                  the head-trainer address from /scratchspace. NOTE: only global
+#                  rank 0 fetches hidden states from the single serve and
+#                  broadcasts to the rest (DataLoaderDispatcher), so the single
+#                  serve is the throughput ceiling — adding trainer nodes scales
+#                  effective batch / compute, not data-production throughput.
 #
 # Env vars (required):
 #   HF_MODEL_CKPT       Target model path. Used by both vllm serve (as the
@@ -56,7 +63,8 @@
 #   TRAIN_GPUS          single-node only: CUDA_VISIBLE_DEVICES for the trainer.
 #                       default = all local GPUs except SERVE_GPU.
 #   SERVE_ADVERTISE_IP  multi-node only: address node 1 should dial. default is
-#                       node 0's first `hostname -I` IP.
+#                       node 0's routable IP (its resolved Slurm node name, else
+#                       its first non-loopback / non-link-local IP).
 #
 # All script args are forwarded to launch_train.sh (typically: --config <yaml>
 # plus OmegaConf dotlist overrides).
@@ -112,7 +120,7 @@ export PATH=$PATH:/workspace/.local/bin
 
 ###################################################################################################
 
-trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
+trap 'error_handler $0 $LINENO' ERR
 
 if [ -z "$HF_MODEL_CKPT" ]; then
     echo "ERROR: HF_MODEL_CKPT must be set." >&2; exit 1
@@ -154,11 +162,9 @@ launch_vllm() {
     # would expose *zero* GPUs (not all), so leave it unset to use the whole node.
     local -a gpu_env=()
     [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
-    # Optional single-value memory knobs (each a space-free env value, so they
-    # survive nemo_run's unquoted `export FOO=value`; assembled into --flag value
-    # pairs here). --cpu-offload-gb spills N GB of weights/GPU to host RAM, the
-    # key lever for fitting a large model on too-few GPUs (slower, prefill-only
-    # use tolerates it). --max-model-len / --max-num-seqs trim KV/activation.
+    # Optional single-value memory knobs (see header), assembled into --flag
+    # value pairs. Each is a space-free env value so it survives nemo_run's
+    # unquoted `export FOO=value`.
     local -a opt_args=()
     [ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB")
     [ -n "${SERVE_MAX_MODEL_LEN:-}" ]  && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN")
@@ -222,28 +228,52 @@ wait_vllm_ready() {
 # per process; multiple workers would duplicate requests against the server.
 run_trainer_and_export() {
     local url="$1" cvd="$2"
-    echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all})..."
+    # Optional multi-node trainer routing (see dispatch section). Defaults keep
+    # the original single-trainer-node behavior: no --num_nodes, export on rank 0.
+    local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}"
+    echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..."
     # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all).
     local -a gpu_env=()
     [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
+    # Engage accelerate multi-node routing only when >1 trainer node; a single
+    # trainer node keeps the original invocation (no --num_nodes) verbatim.
+    local -a mn_args=()
+    if [ "${num_tnodes}" -gt 1 ]; then
+        mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank")
+    fi
     "${gpu_env[@]}" bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \
         "${SCRIPT_ARGS[@]}" \
+        "${mn_args[@]}" \
         data.streaming_server_url="$url" \
         data.streaming_model_name="$HF_MODEL_CKPT" \
         data.streaming_shared_storage_path="$SERVE_SCRATCH" \
         training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; }
 
+    # Export only on the head trainer (machine_rank 0); non-head trainer nodes
+    # would race writing the same export dir. The export reads the saved
+    # checkpoint (training.output_dir), not the serve, so it is serve-independent.
+    if [ "${mrank}" -ne 0 ]; then
+        echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)."
+        return 0
+    fi
+
+    # Export the trained draft to HF format. Derive the checkpoint dir from the
+    # forwarded `training.output_dir=` dotlist (defaulting to the EAGLE
+    # convention) so EAGLE and DFlash runs each export their own output_dir.
+    # EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass
+    # --trust_remote_code; empty by default so EAGLE behavior is unchanged.
+    local out_dir
+    out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1)
+    out_dir="${out_dir:-/scratchspace/eagle3}"
     python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \
-        --model_path /scratchspace/eagle3 \
-        --export_path /scratchspace/export
+        --model_path "$out_dir" \
+        --export_path "${EXPORT_PATH:-/scratchspace/export}" \
+        ${EXPORT_EXTRA_ARGS:-}
 }
 
 # ---------------------------------------------------------------------------
-# Topology dispatch (driven by the Slurm allocation, i.e. the yaml `nodes:`):
-#   SLURM_NNODES == 1  -> co-located: vllm on $SERVE_GPU, trainer on the rest.
-#   SLURM_NNODES >= 2  -> split: node 0 serves on all its GPUs, node 1 trains on
-#                         all its GPUs; they rendezvous via /scratchspace.
-# nemo_run runs this script once per node, so we branch on $SLURM_NODEID.
+# Topology dispatch (see header): nemo_run runs this script once per node, so
+# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads.
 # ---------------------------------------------------------------------------
 NNODES="${SLURM_NNODES:-1}"
 NODEID="${SLURM_NODEID:-0}"
@@ -299,27 +329,55 @@ elif [ "$NODEID" -eq 0 ]; then
     while [ ! -f "$DONE_FILE" ]; do sleep 10; done
     echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)."
 
-elif [ "$NODEID" -eq 1 ]; then
-    # ---------------------- multi-node: trainer node -----------------------
-    # Release the serve node on any exit (success or failure) so it doesn't hang.
-    trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
+elif [ "$NODEID" -ge 1 ]; then
+    # -------------------- multi-node: trainer node(s) ----------------------
+    # Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which
+    # map to 0-based accelerate machine ranks (head trainer = SLURM node 1).
+    NUM_TRAINER_NODES=$(( NNODES - 1 ))
+    TRAINER_RANK=$(( NODEID - 1 ))
+    TRAINER_ADDR_FILE="/scratchspace/.trainer_addr"
+
+    # Only the head trainer (rank 0) signals the serve node to release on exit;
+    # a non-head node exiting first must NOT tear the serve down early.
+    if [ "$TRAINER_RANK" -eq 0 ]; then
+        trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
+        rm -f "$TRAINER_ADDR_FILE"                 # clear stale rendezvous state
+    fi
 
-    echo "Trainer node waiting (up to ${SERVE_READY_TIMEOUT}s) for the serve address..."
+    echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..."
     for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
         [ -f "$SERVE_ADDR_FILE" ] && break
         sleep 1
     done
     [ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; }
     URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}"
-
     wait_vllm_ready "$URL" || exit 1
-    run_trainer_and_export "$URL" "" || exit 1
 
-else
-    # ------------- multi-node: extra nodes (unused by default) -------------
-    echo "Node rank ${NODEID} idle: the default split uses node 0 = vllm serve, node 1 = trainer."
-    echo "Multi-node *training* (>1 trainer node) is not wired up yet; allocate exactly 2 nodes."
-    while [ ! -f "$DONE_FILE" ]; do sleep 10; done
+    if [ "$NUM_TRAINER_NODES" -le 1 ]; then
+        # Original 1-serve + 1-trainer topology: single-node DDP, unchanged.
+        run_trainer_and_export "$URL" "" || exit 1
+    else
+        # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's
+        # c10d rendezvous (port 29500); all trainer nodes read it and join. Reuse
+        # the serve node's IP-resolution logic (avoid link-local / loopback).
+        if [ "$TRAINER_RANK" -eq 0 ]; then
+            head_addr="${TRAINER_ADVERTISE_IP:-}"
+            [ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
+            [ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1)
+            [ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}')
+            echo "$head_addr" > "$TRAINER_ADDR_FILE"
+            echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous."
+        else
+            echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..."
+            for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
+                [ -f "$TRAINER_ADDR_FILE" ] && break
+                sleep 1
+            done
+            [ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; }
+        fi
+        HEAD_IP=$(cat "$TRAINER_ADDR_FILE")
+        run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1
+    fi
 fi
 
 ###################################################################################################
diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index aa60bbad9e9..f6ae6493af3 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -286,6 +286,9 @@ def build_slurm_executor(
         retries=0,
         packager=packager,
         srun_args=slurm_config.srun_args,
+        # --segment=<N>: pin all nodes into one topology block (one NVL72 / NVLink
+        # domain). None -> omitted, scheduler places freely (default behavior).
+        segment=slurm_config.segment,
     )
     return executor
 
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
new file mode 100644
index 00000000000..b12c3b0f538
--- /dev/null
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
@@ -0,0 +1,64 @@
+# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4).
+#
+# Single-task pipeline that exercises the full convert→save→export path WITHOUT
+# actually training. Uses the same `common/specdec/dflash_online_training.sh`
+# entrypoint as a real DFlash run; all dry-run behaviour comes from the `--dry_run`
+# flag (shared with EAGLE3; mode-agnostic) plus dotlist overrides on `main.py`:
+#
+#   --dry_run                              → main.py skips trainer.train(), saves
+#                                            the (untrained) ModelOpt checkpoint
+#                                            to training.output_dir right after
+#                                            mtsp.convert(model, [("dflash", ...)])
+#   data.offline_data_path=<placeholder>   → DataArguments derives data.mode from
+#                                            the data-source fields, so setting an
+#                                            offline path makes mode='offline' →
+#                                            use_offline_training=True. Combined
+#                                            with use_fake_base_for_offline=true
+#                                            this loads a FakeBaseModel (only
+#                                            embed_tokens + lm_head), so the ~1T
+#                                            MoE base fits on a single GPU. The
+#                                            file is never read in --dry_run mode.
+#   model.trust_remote_code=true           → Kimi-K2.5 (deepseek_v3 arch) ships a
+#                                            custom modeling file
+#   dflash.dflash_mask_token_id=163838     → Kimi-K2.5 has no dedicated mask token
+#                                            ([EOS]=163585, [PAD]=163839); 163838 is
+#                                            a reserved slot used as the DFlash mask
+#                                            (matches the real Kimi-K2.5 DFlash run)
+#
+# The dflash_online_training.sh export block then writes an HF-format DFlash draft
+# to /scratchspace/dflash/exported-checkpoint-final with the correct architecture
+# (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by
+# design. Useful for smoke-testing the launcher / convert / export plumbing and
+# validating downstream loaders without paying for a real training run.
+#
+# Usage:
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes
+
+job_name: Kimi-K2.5_DFlash_dryrun
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4/
+
+  # Convert → save → export (no training).
+  task_0:
+    script: common/specdec/dflash_online_training.sh
+    args:
+      - --dry_run
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.use_fake_base_for_offline=true
+      - model.trust_remote_code=true
+      - data.offline_data_path=/tmp/dryrun-placeholder
+      - training.output_dir=/scratchspace/dflash
+      - training.disable_tqdm=true
+      - dflash.dflash_mask_token_id=163838
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
new file mode 100644
index 00000000000..ff99ae62c7f
--- /dev/null
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -0,0 +1,131 @@
+# DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
+# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve +
+# trainer split, same hardware reasoning — but trains a DFlash drafter instead of
+# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the
+# dflash recipe.
+#
+# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
+# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
+# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4,
+# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes.
+#
+# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
+# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
+# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets
+# dflash_offline=True and the DFlash module consumes the streamed hidden states
+# (base_model_outputs) instead of running the fake base. The vLLM connector,
+# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps
+# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST
+# captured layer into base_model_hidden_states (used for DFlash self-logit
+# distillation) and the REST into aux_hidden_states (DFlash's concatenated
+# target-layer features). So n_captured must be (num DFlash target layers + 1).
+#
+# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
+# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3):
+#   DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5)
+#   = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) =
+#   [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] (6 ids), so the
+#   dataset yields 5 aux layers, matching the 5-layer DFlash draft block.
+#
+# answer_only_loss: forced false here. DFlash's recipe default is true, which
+# requires the tokenizer chat template to carry {% generation %} tags so the
+# streaming dataset can derive an assistant-token mask; Kimi's template does not,
+# and the streaming path (unlike online) does not inject data.chat_template. To
+# train assistant-only later, supply a generation-tagged template and flip this on.
+#
+# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+#          SLURM_PARTITION=batch \
+#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
+#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          NEMORUN_HOME=$PWD
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \
+#          identity=$HOME/.ssh/id_ecdsa detach=True --yes
+#
+# The export lands in /scratchspace/export. To benchmark it, point
+# specdec_bench.yaml's --draft_model_dir there (or copy it under /hf-local).
+
+job_name: Kimi-K2.5-NVFP4_DFlash_streaming
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+
+  # Step 1: Build input conversations (model-agnostic)
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
+      # so request 4 even though make_dataset is CPU-only.
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node.
+  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh):
+  # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ.
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.use_fake_base_for_offline=true
+      - model.trust_remote_code=true
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4
+      # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker
+      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
+      - data.streaming_prefetch=8
+      - training.output_dir=/scratchspace/dflash
+      # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.num_train_epochs=1
+      - training.max_steps=3000
+      # See header: Kimi's template lacks {% generation %} tags; train on all tokens.
+      - training.answer_only_loss=false
+      # dflash.yaml sets report_to=tensorboard, but the vLLM container has no
+      # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable
+      # reporting (loss still prints to stdout via logging_steps).
+      - training.report_to=none
+      # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838
+      # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run).
+      - dflash.dflash_mask_token_id=163838
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces in values: nemo_run emits `export FOO=value` unquoted.
+      # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
+      - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
+      - SERVE_TP: "4"
+      # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
+      - EXPORT_EXTRA_ARGS: "--trust_remote_code"
+      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
+      # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the
+      # training seq len and leave headroom for activation spikes.
+      - SERVE_MAX_MODEL_LEN: "4096"
+      # Small batches: smaller per-step MoE compute stays under the engine timeout.
+      - SERVE_MAX_NUM_SEQS: "4"
+      - SERVE_GPU_MEM_UTIL: "0.8"
+      - SERVE_READY_TIMEOUT: "2400"
+      - SERVE_EXTRA_ARGS: "--trust-remote-code"
+      # The fatal error was "RPC call to sample_tokens timed out" — a worker stalls on
+      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
+      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
+      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
+      - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
new file mode 100644
index 00000000000..fb92ba11234
--- /dev/null
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -0,0 +1,133 @@
+# DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
+# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve +
+# trainer split, same hardware reasoning — but trains a DFlash drafter instead of
+# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the
+# dflash recipe.
+#
+# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
+# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
+# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4,
+# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each. NOTE(review): task_1 below requests nodes: 3 / segment: 3, not 2 -- confirm the third node's role.
+#
+# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
+# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
+# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets
+# dflash_offline=True and the DFlash module consumes the streamed hidden states
+# (base_model_outputs) instead of running the fake base. The vLLM connector,
+# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps
+# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST
+# captured layer into base_model_hidden_states (used for DFlash self-logit
+# distillation) and the REST into aux_hidden_states (DFlash's concatenated
+# target-layer features). So n_captured must be (num DFlash target layers + 1).
+#
+# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
+# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3):
+#   DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5)
+#   = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) =
+#   [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the
+#   dataset yields 5 aux layers, matching the 5-layer DFlash draft block.
+#
+# answer_only_loss: forced false here. DFlash's recipe default is true, which
+# requires the tokenizer chat template to carry {% generation %} tags so the
+# streaming dataset can derive an assistant-token mask; Kimi's template does not,
+# and the streaming path (unlike online) does not inject data.chat_template. To
+# train assistant-only later, supply a generation-tagged template and flip this on.
+#
+# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+#          SLURM_PARTITION=batch \
+#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
+#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          NEMORUN_HOME=$PWD
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \
+#          identity=$HOME/.ssh/id_ecdsa detach=True --yes
+#
+# The export lands in /scratchspace/export. To benchmark it, point
+# specdec_bench.yaml's --draft_model_dir there (or copy it under /hf-local).
+
+job_name: Kimi-K2.5-NVFP4_DFlash_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+
+  # Step 1: Build input conversations (model-agnostic)
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
+      # so request 4 even though make_dataset is CPU-only.
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node.
+  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh):
+  # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ.
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.use_fake_base_for_offline=true
+      - model.trust_remote_code=true
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4
+      # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker
+      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
+      - data.streaming_prefetch=8
+      - training.output_dir=/scratchspace/dflash
+      # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.num_train_epochs=1
+      - training.ar_validate_steps=500000
+      - training.max_steps=500
+      # See header: Kimi's template lacks {% generation %} tags; train on all tokens.
+      - training.answer_only_loss=false
+      # dflash.yaml sets report_to=tensorboard, but the vLLM container has no
+      # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable
+      # reporting (loss still prints to stdout via logging_steps).
+      - training.report_to=none
+      # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838
+      # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run).
+      - dflash.dflash_mask_token_id=163838
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces in values: nemo_run emits `export FOO=value` unquoted.
+      # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
+      - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
+      - SERVE_TP: "4"
+      # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
+      - EXPORT_EXTRA_ARGS: "--trust_remote_code"
+      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
+      # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the
+      # training seq len and leave headroom for activation spikes.
+      - SERVE_MAX_MODEL_LEN: "4096"
+      # Small batches: smaller per-step MoE compute stays under the engine timeout.
+      - SERVE_MAX_NUM_SEQS: "4"
+      - SERVE_GPU_MEM_UTIL: "0.8"
+      - SERVE_READY_TIMEOUT: "2400"
+      - SERVE_EXTRA_ARGS: "--trust-remote-code"
+      # Observed failure: "RPC call to sample_tokens timed out" — a worker stalls on
+      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
+      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
+      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
+      - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 3
+      segment: 3
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
new file mode 100644
index 00000000000..a943f39c27e
--- /dev/null
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
@@ -0,0 +1,81 @@
+# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM.
+#
+# Serves Kimi-K2.5-NVFP4 in-process (no HTTP server — specdec_bench drives an
+# AsyncLLM) at TP=4 with expert parallelism, attaches a trained/exported DFLASH
+# draft, and benchmarks speculative decoding on MT-Bench. Writes timing.json +
+# aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl
+# to /scratchspace/specdec_bench/.
+#
+# Hardware = GB200/Blackwell (HSG), same reasoning as hf_streaming_eagle3.yaml:
+# Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the 192 GB/GPU of GB200; it fits
+# at TP=4 on ONE 4-GPU node with no cpu-offload. On CW H100 it has no native FP4
+# and falls back to offload, so the working path is GB200.
+#
+# DFLASH specifics:
+#   - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8);
+#     --draft_length does NOT apply to DFLASH. To override sampling / engine args
+#     (e.g. speculative_num_draft_tokens, temperature), write a runtime-params
+#     yaml and add `- --runtime_params <path>` below — see
+#     examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern).
+#   - --draft_model_dir must point at a trained+exported HF-format DFLASH draft
+#     (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Edit the path
+#     below, or override on the CLI: pipeline.task_0.args[0]="--draft_model_dir /hf-local/<draft>"
+#   - Kimi needs --trust_remote_code for both tokenizer and model.
+#
+# NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so
+# it runs without any data-prep step. To benchmark on SPEED-Bench instead, first
+# generate + stage a split:
+#     python3 examples/specdec_bench/prepare_data.py --dataset speed --config all
+# (splits: qualitative, throughput_1k, throughput_16k, ...) then swap the
+# `--mtbench` arg for:
+#     - --dataset speed
+#     - --dataset_path modules/Model-Optimizer/examples/specdec_bench/data/speed/throughput_16k
+#
+# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH support;
+# on GB200/aarch64 use an aarch64 vLLM image new enough for DFLASH (validated on
+# a 0511 nightly). Override with: pipeline.task_0.slurm_config.container=<image>
+#
+# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+#          SLURM_PARTITION=batch \
+#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
+#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          NEMORUN_HOME=$PWD
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/specdec_bench.yaml \
+#          identity=$HOME/.ssh/id_ecdsa detach=True --yes
+
+job_name: Kimi-K2.5-NVFP4_DFLASH_specdec_bench
+
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      # TODO: point at your trained + exported HF-format DFLASH draft checkpoint.
+      - --draft_model_dir /hf-local/nvidia/Kimi-K2.5-DFlash
+      - --speculative_algorithm DFLASH
+      - --engine VLLM
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --tp_size 4
+      - --ep_size 4
+      - --concurrency 32
+      - --output_length 1024
+      - --trust_remote_code
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/specdec_bench
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index 8ecd51f6f86..0bcfff14ad9 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -48,6 +48,11 @@ class SlurmConfig:
     gpus_per_node: int = 1
     time: str = "04:00:00"
     local: bool = False
+    # Slurm --segment=<N>: force the job's nodes into a single topology block.
+    # On a topology/block cluster (e.g. GB200 NVL72, where one block = one NVLink
+    # domain) set this to the node count to keep all nodes in one NVL72 so
+    # inter-node traffic rides NVLink. None = let the scheduler place freely.
+    segment: Optional[int] = None
 
 
 @run.cli.factory
@@ -68,6 +73,7 @@ def slurm_factory(
     srun_args: list[str] = ["--no-container-mount-home"],
     array: Optional[str] = None,
     time: str = "04:00:00",
+    segment: Optional[int] = None,
 ) -> SlurmConfig:
     """Generic Slurm factory — configure via environment variables or CLI overrides."""
     return SlurmConfig(
@@ -84,4 +90,5 @@ def slurm_factory(
         srun_args=srun_args,
         array=array,
         time=time,
+        segment=segment,
     )

From 6b8e784fa539dcbedfd9712cb204bc6c4d33986f Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 04:10:38 +0000
Subject: [PATCH 02/14] iterable dataset to map-style dataset

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/eagle_utils.py  |   8 +-
 examples/speculative_decoding/main.py         |  10 +-
 .../plugins/hf_streaming_dataset.py           | 405 ++++++------------
 .../speculative/plugins/hf_training_args.py   |   3 +
 .../test_eagle_streaming.py                   |   3 +-
 .../plugins/test_hf_streaming_dataset.py      | 240 +++++------
 .../common/eagle3/train_eagle_streaming.sh    |  10 +-
 .../hf_streaming_dflash_multi_node.yaml       |  24 +-
 8 files changed, 272 insertions(+), 431 deletions(-)

diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py
index f9675e54161..bcdcf15e8c2 100644
--- a/examples/speculative_decoding/eagle_utils.py
+++ b/examples/speculative_decoding/eagle_utils.py
@@ -88,14 +88,16 @@ def make_speculative_data_module(
         ds = load_dataset("json", data_files=data_args.data_path, split="train")
         if data_args.sample_size > 0:
             ds = ds.select(range(data_args.sample_size))
+        # Map-style dataset: each rank fetches its own DistributedSampler shard.
+        # Fetch concurrency comes from the DataLoader's num_workers, not a config knob;
+        # shuffling/order is the sampler's job, so no seed is threaded here.
+        # ``server_urls`` accepts a comma-separated string for multi-server fan-out.
         streaming_cfg = EagleVllmStreamingConfig(
-            server_url=data_args.streaming_server_url,
+            server_urls=data_args.streaming_server_url,
             model=data_args.streaming_model_name,
             shared_storage_root=data_args.streaming_shared_storage_path,
             max_seq_len=train_len,
             answer_only_loss=answer_only_loss,
-            prefetch=data_args.streaming_prefetch,
-            seed=seed,
         )
         train_dataset = EagleVllmStreamingDataset(
             entries=ds,
diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py
index 9b7a9f44d2e..4405bf0cd90 100644
--- a/examples/speculative_decoding/main.py
+++ b/examples/speculative_decoding/main.py
@@ -278,12 +278,12 @@ def train():
     ):
         callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps))
     if recipe.data.mode == "streaming":
-        # Skip-on-resume happens inside the dataset (no re-fetch from server);
-        # disable HF Trainer's own data skip so the offset isn't applied twice.
-        from modelopt.torch.speculative.plugins.hf_streaming_dataset import StreamingResumeCallback
-
+        # The streaming dataset is map-style, so HF Trainer's default resume would
+        # fast-forward by re-iterating (= re-fetching) every consumed batch just to
+        # discard it, hammering the server. Disable the data skip: on resume, weights/
+        # optimizer/global_step still restore from the checkpoint; only the data order
+        # restarts from the top (acceptable for single-epoch streaming).
         training_args.ignore_data_skip = True
-        callbacks.append(StreamingResumeCallback())
 
     trainer = EagleTrainerWithAccLog(
         model=model,
diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
index 31adbc96bf4..6b050f00add 100644
--- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
+++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
@@ -13,11 +13,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Streaming datasets that fetch per-sample hidden states from a running inference server.
-
-The base class :class:`StreamingDataset` owns all the backend-/algorithm-
-agnostic plumbing: threading, queue, tokenization, the bounded sliding-window
-producer, loss_mask alignment, and HTTP-client lifecycle. Concrete subclasses
+"""Map-style datasets that fetch per-sample hidden states from a running inference server.
+
+This is the streaming sibling of :class:`OfflineSupervisedDataset`: instead of
+reading a pre-dumped ``.pt`` file in ``__getitem__``, it fetches the per-sample
+hidden states from a live inference server over HTTP. It is a plain
+``torch.utils.data.Dataset`` (map-style), so DDP sharding is handled the standard
+way -- HF Trainer wraps it in a ``DistributedSampler`` and each rank's DataLoader
+calls ``__getitem__`` only for that rank's indices. Each rank therefore fetches
+**only its own shard** (no rank-0 funnel, no broadcast); aggregate read bandwidth
+scales with the number of trainer ranks.
+
+Fetch concurrency comes from the DataLoader's ``num_workers`` (each worker process
+issues one blocking request at a time); there is no in-process producer thread.
+Keep ``num_workers`` modest and bounded so the per-server in-flight request count
+(``ranks-hitting-a-server x num_workers``) stays near the server's ``max_num_seqs``
+-- flooding a cold NVFP4 MoE server can stall a worker past vLLM's execute-model
+timeout and kill EngineCore.
+
+The base class :class:`StreamingDataset` owns the backend-/algorithm-agnostic
+plumbing: tokenization, the resample-on-failure ``__getitem__`` loop, the
+consecutive-failure circuit breaker, and loss_mask alignment. Concrete subclasses
 specialize along two axes:
 
 - **Backend** (how to talk to the server, how to decode the response): override
@@ -25,22 +41,14 @@
 - **Algorithm** (how to shape the per-sample dict for the trainer): override
   :meth:`_format`.
 
-:class:`EagleVllmStreamingDataset` is currently the only concrete
-combination (Eagle algorithm × vLLM backend); future combinations live as
-sibling subclasses.
-
-Requires ``dataloader_num_workers=0``: multiple workers would each spawn their
-own asyncio loop and issue duplicate requests against the server.
+:class:`EagleVllmStreamingDataset` is currently the only concrete combination
+(Eagle algorithm x vLLM backend); future combinations live as sibling subclasses.
 """
 
 from __future__ import annotations
 
-import asyncio
 import contextlib
 import os
-import queue
-import random
-import threading
 from pathlib import Path
 from typing import TypedDict
 
@@ -48,17 +56,13 @@
 import torch
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 from safetensors import safe_open
-from torch.utils.data import IterableDataset, get_worker_info
-from transformers import TrainerCallback
+from torch.utils.data import Dataset
 from transformers.trainer_pt_utils import LabelSmoother
 
-from modelopt.torch.utils import distributed as dist_utils
 from modelopt.torch.utils import print_rank_0, warn_rank_0
 
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
-_SENTINEL = object()
-
 
 def _tokenize_with_loss_mask(
     tokenizer,
@@ -104,33 +108,33 @@ def _tokenize_with_loss_mask(
 class StreamingConfig(BaseModel):
     """Static tuning knobs for :class:`StreamingDataset`.
 
-    Bundles the rarely-changing settings (loss masking, concurrency, HTTP timeout)
-    so the dataset ctor takes only ``entries`` + ``tokenizer`` + this config.
+    Bundles the rarely-changing settings (loss masking, HTTP timeout) so the dataset
+    ctor takes only ``entries`` + ``tokenizer`` + this config.
     """
 
     model_config = ConfigDict(extra="forbid")
 
     answer_only_loss: bool = False
-    prefetch: int = Field(default=64, ge=1)
     request_timeout: float = Field(default=600.0, gt=0)
     # Token-level cap applied during tokenization (right-truncation). Must hold
     # ``max_seq_len <= vllm.max_model_len``. ``None`` disables truncation.
     max_seq_len: int | None = None
-    # Must be identical on every rank — the dataset shuffles with this seed then
-    # stripes by rank, so equal seeds are required for the partition to be disjoint.
-    seed: int = 0
-    # Circuit breaker: raise after this many consecutive _fetch failures so a dead
-    # server doesn't silently drain the corpus.
+    # Circuit breaker: raise after this many consecutive _fetch failures (per worker
+    # process) so a dead server doesn't silently resample the whole corpus.
     fail_after_consecutive_skips: int = Field(default=16, ge=1)
 
 
-class StreamingDataset(IterableDataset):
-    """Base class: stream per-sample hidden states from a running inference server.
+class StreamingDataset(Dataset):
+    """Base class: map-style dataset that streams per-sample hidden states from a server.
 
     Backend- and algorithm-agnostic; subclasses implement :meth:`_fetch` (backend) and
     :meth:`_format` (algorithm). The dict shape exchanged between them is the
     algorithm-level contract, declared as a ``TypedDict`` in :attr:`fetch_payload_cls`
     and validated against the actual ``_fetch`` output on every sample.
+
+    ``__getitem__`` must always return a valid sample for the sampler's index, so it
+    resamples forward through the corpus on an unfit entry or a fetch failure rather
+    than skipping (a skip would shrink the batch and desync DDP).
     """
 
     config_cls: type[StreamingConfig] = StreamingConfig
@@ -145,217 +149,78 @@ def __init__(
         tokenizer,
         config: StreamingConfig | None = None,
     ):
-        """Hold the *full* corpus on every rank; fetch lazily, rank 0 only.
+        """Hold the full corpus; fetch lazily, per index, in ``__getitem__``.
 
-        DDP sharding is delegated to Accelerate's ``DataLoaderDispatcher``: rank 0
-        consumes the dataset and broadcasts each batch; non-zero ranks rely on
-        :meth:`__iter__`'s rank guard. The corpus is held in full on every rank --
-        the dispatcher reads only rank 0's stream, so sharding here would just
-        shrink that view. Shuffling with ``config.seed`` runs on every rank so
-        the order is reproducible regardless of which rank ends up fetching.
+        DDP sharding is handled by HF Trainer's ``DistributedSampler``: each rank's
+        DataLoader requests only its own indices, so each rank fetches only its
+        shard. The corpus order is left as given -- the sampler shuffles indices
+        (seeded by ``training_args.seed``), so no shuffle is needed here.
 
         Args:
             entries: Untokenized per-sample dicts from the input jsonl. Schema is
-                subclass-defined (see :meth:`_tokenize_entry`); passed through to :meth:`_fetch`.
+                subclass-defined (see :meth:`_tokenize_entry`); passed to :meth:`_fetch`.
             tokenizer: HF tokenizer; used for client-side tokenization and the
                 server/client loss-mask alignment in :meth:`_fetch`.
-            config: Tuning knobs (prefetch, timeout, seed, ...); defaults to
+            config: Tuning knobs (timeout, answer_only_loss, ...); defaults to
                 ``self.config_cls()``. See :class:`StreamingConfig`.
         """
         if not entries:
             raise ValueError("entries is empty")
         self.tokenizer = tokenizer
         self.config = config if config is not None else self.config_cls()
-        # One-shot, consumed by the next __iter__.
-        self._resume_skip = 0
-
-        indices = list(range(len(entries)))
-        random.Random(self.config.seed).shuffle(indices)
-        self.entries = [entries[i] for i in indices]
-        rank, world = dist_utils.rank(), dist_utils.size()
-        print_rank_0(
-            f"[{type(self).__name__}] rank {rank}/{world}: "
-            f"holds {len(self.entries)} entries (full corpus; rank 0 fetches)"
-        )
+        # Materialize to a plain list so DataLoader worker processes fork it cheaply.
+        self.entries = list(entries)
+        # Per-process consecutive-failure counter for the circuit breaker. Reset to 0
+        # on every successful fetch; tripped only by fetch failures (not unfit entries).
+        self._consecutive_fail = 0
+        print_rank_0(f"[{type(self).__name__}] map-style dataset over {len(self.entries)} entries")
 
     def __len__(self) -> int:
         return len(self.entries)
 
-    def set_resume_position(self, skip: int) -> None:
-        """Drop the first ``skip`` entries on the next ``__iter__`` without fetching.
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        """Tokenize -> fetch -> format the sample at ``idx``, resampling on miss.
 
-        One-shot; cleared once iteration starts. Used by
-        :class:`StreamingResumeCallback` on HF Trainer checkpoint resume so the
-        server is not re-queried for already-consumed samples.
+        Always returns a valid sample. An unfit entry (tokenization yields nothing) or
+        a fetch failure causes a forward probe to the next index; fetch failures bump
+        the circuit breaker, which raises once ``fail_after_consecutive_skips`` is hit.
         """
-        self._resume_skip = skip
-
-    @staticmethod
-    def _verify_accelerate_dispatcher() -> None:
-        """Raise if Accelerate is initialized for DDP with ``dispatch_batches=False``.
-
-        Best-effort: no-op when Accelerate isn't installed/initialized or in single-process.
-        """
-        try:
-            from accelerate.state import AcceleratorState
-        except ImportError:
-            return
-        if not AcceleratorState._shared_state:
-            return
-        state = AcceleratorState()
-        if getattr(state, "num_processes", 1) <= 1:
-            return
-        # Field moved to ``dataloader_config`` in newer Accelerate; check both.
-        dispatch = getattr(state, "dispatch_batches", None)
-        if dispatch is None:
-            dl_cfg = getattr(state, "dataloader_config", None)
-            if dl_cfg is not None:
-                dispatch = getattr(dl_cfg, "dispatch_batches", None)
-        if dispatch is False:
-            raise RuntimeError(
-                "StreamingDataset requires Accelerate's DataLoaderDispatcher "
-                "(dispatch_batches=True); got False — non-zero ranks would receive no data."
-            )
-
-    def __iter__(self):
-        # IterableDataset with DataLoader workers > 0 would spawn one asyncio loop
-        # per worker, each issuing the full request set — silent Nx duplication
-        # against the server. Fail loud instead.
-        if get_worker_info() is not None:
-            raise RuntimeError(
-                f"{type(self).__name__} requires dataloader_num_workers=0; "
-                "multiple workers would each spawn an asyncio loop and duplicate requests."
-            )
-        # Without dispatch_batches the rank-0 guard below would silently starve
-        # non-zero ranks; fail loud instead.
-        self._verify_accelerate_dispatcher()
-        # Only rank 0 fetches; non-zero ranks receive batches via the dispatcher's broadcast.
-        if dist_utils.rank() != 0:
-            return
-        # Fresh producer per __iter__ call so re-iteration (which shouldn't
-        # happen in 1-epoch streaming) at least doesn't deadlock.
-        q: queue.Queue = queue.Queue(maxsize=self.config.prefetch)
-        stop = threading.Event()
-        skip = self._resume_skip
-        self._resume_skip = 0  # one-shot
-        entries = self.entries[skip:] if skip else self.entries
-
-        def run():
+        n = len(self.entries)
+        for offset in range(n):
+            entry = self.entries[(idx + offset) % n]
+            sample = self._tokenize_entry(entry)
+            if sample is None:
+                continue  # entry unfit pre-fetch; server not at fault, try the next one
             try:
-                asyncio.run(self._produce(q, stop, entries))
+                fetched = self._fetch(sample)
             except Exception as e:
-                q.put(e)  # surface to consumer
-            finally:
-                q.put(_SENTINEL)
-
-        thread = threading.Thread(target=run, daemon=True)
-        thread.start()
-
-        try:
-            while True:
-                item = q.get()
-                if item is _SENTINEL:
-                    break
-                if isinstance(item, Exception):
-                    raise item
-                yield item
-        finally:
-            stop.set()
-            # Drain any leftover items so producer can exit
-            with contextlib.suppress(queue.Empty):
-                while True:
-                    q.get_nowait()
-
-    async def _produce(self, q: queue.Queue, stop: threading.Event, entries):
-        """Stream ``entries`` through a sliding window of at most ``prefetch`` in-flight tasks.
-
-        Counter is local (single writer); ``_process`` tasks report outcome via return value.
-        The circuit breaker has *batch-level* (not per-task) granularity: when
-        ``asyncio.wait(FIRST_COMPLETED)`` returns several tasks in the same loop turn,
-        ``consecutive_skips`` reflects set-iteration order over ``done`` -- sufficient
-        for "detect a dead server" but not strict temporal ordering.
-
-        Args:
-            q: Bounded queue drained by :meth:`__iter__`; full queue backpressures fetching.
-            stop: Set by the consumer to request shutdown; checked between samples.
-            entries: Resume-adjusted slice of ``self.entries`` to fetch this iteration.
-        """
-        timeout = httpx.Timeout(self.config.request_timeout, connect=10.0)
-        threshold = self.config.fail_after_consecutive_skips
-        consecutive_skips = 0
-        async with httpx.AsyncClient(timeout=timeout) as client:
-            pending: set[asyncio.Task] = set()
-            entries_iter = iter(entries)
-            exhausted = False
-            try:
-                while not stop.is_set():
-                    while len(pending) < self.config.prefetch and not exhausted:
-                        try:
-                            entry = next(entries_iter)
-                        except StopIteration:
-                            exhausted = True
-                            break
-                        pending.add(asyncio.create_task(self._process(client, entry, q, stop)))
-                    if not pending:
-                        break
-                    done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
-                    for task in done:
-                        outcome = task.result()  # re-raises unexpected errors
-                        if outcome is True:
-                            consecutive_skips = 0
-                        elif outcome is False:
-                            consecutive_skips += 1
-                        # None -> entry unfit pre-fetch; server not at fault
-                    if consecutive_skips >= threshold:
-                        raise RuntimeError(
-                            f"{consecutive_skips} consecutive _fetch failures "
-                            f"in {type(self).__name__}; server likely down."
-                        )
-            finally:
-                for task in pending:
-                    task.cancel()
-                if pending:
-                    await asyncio.gather(*pending, return_exceptions=True)
-
-    async def _process(
-        self,
-        client: httpx.AsyncClient,
-        entry: dict,
-        q: queue.Queue,
-        stop: threading.Event,
-    ) -> bool | None:
-        """Tokenize -> fetch -> format -> enqueue.
-
-        Returns True on enqueue, False on fetch failure (bumps breaker), None
-        when the entry is unfit pre-fetch (no breaker effect).
-        """
-        if stop.is_set():
-            return None
-        sample = await asyncio.to_thread(self._tokenize_entry, entry)
-        if sample is None:
-            return None
-        try:
-            fetched = await self._fetch(client, sample)
-        except Exception as e:
-            warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}")
-            return False
-        if fetched is None:
-            return False
-        if self.fetch_payload_cls is not None:
-            # ``__required_keys__`` is a TypedDict runtime attribute mypy doesn't
-            # track on ``type``; the assignment site guarantees it's a TypedDict.
-            required: frozenset[str] = self.fetch_payload_cls.__required_keys__  # type: ignore[attr-defined]
-            missing = required - set(fetched)
-            if missing:
-                raise RuntimeError(
-                    f"{type(self).__name__}._fetch missing required keys {missing}; "
-                    f"{self.fetch_payload_cls.__name__} requires "
-                    f"{set(required)}, got {set(fetched)}"
-                )
-        data = self._format(fetched)
-        # Blocking put -> backpressure when trainer is slow.
-        await asyncio.to_thread(q.put, data)
-        return True
+                warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}")
+                fetched = None
+            if fetched is None:
+                self._consecutive_fail += 1
+                if self._consecutive_fail >= self.config.fail_after_consecutive_skips:
+                    raise RuntimeError(
+                        f"{self._consecutive_fail} consecutive _fetch failures in "
+                        f"{type(self).__name__}; server likely down."
+                    )
+                continue  # resample forward
+            self._consecutive_fail = 0
+            if self.fetch_payload_cls is not None:
+                # ``__required_keys__`` is a TypedDict runtime attribute mypy doesn't
+                # track on ``type``; the assignment site guarantees it's a TypedDict.
+                required: frozenset[str] = self.fetch_payload_cls.__required_keys__  # type: ignore[attr-defined]
+                missing = required - set(fetched)
+                if missing:
+                    raise RuntimeError(
+                        f"{type(self).__name__}._fetch missing required keys {missing}; "
+                        f"{self.fetch_payload_cls.__name__} requires "
+                        f"{set(required)}, got {set(fetched)}"
+                    )
+            return self._format(fetched)
+        raise RuntimeError(
+            f"{type(self).__name__}: no fetchable sample found in the entire corpus "
+            f"({n} entries) starting at index {idx}."
+        )
 
     def _tokenize_entry(self, entry: dict) -> dict | None:
         """Tokenize a single entry.
@@ -382,14 +247,14 @@ def _tokenize_entry(self, entry: dict) -> dict | None:
             "loss_mask": loss_mask,
         }
 
-    async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> dict | None:
+    def _fetch(self, sample: dict) -> dict | None:
         """Backend hook: send the request and decode the server's response.
 
-        Override in subclass. Any scratch resources (per-request files, mmap'd
-        buffers) must be released before returning.
+        Override in subclass. Synchronous (called from a DataLoader worker). Any
+        scratch resources (per-request files, mmap'd buffers) must be released before
+        returning.
 
         Args:
-            client: Shared async HTTP client owned by :meth:`_produce`.
             sample: :meth:`_tokenize_entry` output:
                 ``{"cid": str, "token_ids": list[int], "loss_mask": LongTensor[seq]}``.
 
@@ -431,16 +296,24 @@ class EagleFetchPayload(TypedDict):
 class EagleVllmStreamingConfig(StreamingConfig):
     """Adds vLLM endpoint info on top of :class:`StreamingConfig`."""
 
-    server_url: str
+    # One or more vLLM endpoints; fetches round-robin across them so a single fetcher
+    # can spread load over several server replicas. Accepts a list or a single
+    # (optionally comma-separated) string.
+    server_urls: list[str]
     model: str
-    # Allowlist for ``hidden_states_path`` returned by the server. Must match the
-    # connector's ``shared_storage_path``; out-of-tree paths are rejected.
+    # Allowlist for ``hidden_states_path`` returned by the server. Must match (or be a
+    # parent of) the connector's ``shared_storage_path``; out-of-tree paths are rejected.
     shared_storage_root: str
 
-    @field_validator("server_url")
+    @field_validator("server_urls", mode="before")
     @classmethod
-    def _strip_trailing_slash(cls, v: str) -> str:
-        return v.rstrip("/")
+    def _normalize_urls(cls, v):
+        if isinstance(v, str):
+            v = v.split(",")
+        urls = [u.strip().rstrip("/") for u in v if u and str(u).strip()]
+        if not urls:
+            raise ValueError("server_urls must contain at least one non-empty URL")
+        return urls
 
     @field_validator("shared_storage_root")
     @classmethod
@@ -449,7 +322,7 @@ def _resolve_root(cls, v: str) -> str:
 
 
 class EagleVllmStreamingDataset(StreamingDataset):
-    """Eagle (algorithm) × vLLM (backend).
+    """Eagle (algorithm) x vLLM (backend).
 
     Talks to a ``vllm serve`` instance configured with the
     ``ExampleHiddenStatesConnector`` KV-transfer connector (the server dumps captured
@@ -467,13 +340,38 @@ def __init__(
         tokenizer,
         config: EagleVllmStreamingConfig,
     ):
-        """Same as the base; ``config`` must include ``server_url`` and ``model``."""
+        """Same as the base; ``config`` must include ``server_urls`` and ``model``."""
         super().__init__(entries=entries, tokenizer=tokenizer, config=config)
         self.config: EagleVllmStreamingConfig = config
 
-    async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> EagleFetchPayload | None:
-        r = await client.post(
-            f"{self.config.server_url}/v1/completions",
+    def _client(self) -> httpx.Client:
+        """Lazily build a per-process HTTP client and round-robin cursor.
+
+        DataLoader workers are forked processes; httpx connection pools must not be
+        shared across a fork, so each process gets its own client (and its own
+        round-robin cursor over ``server_urls``), keyed by PID.
+        """
+        pid = os.getpid()
+        if getattr(self, "_client_pid", None) != pid:
+            self._http = httpx.Client(
+                timeout=httpx.Timeout(self.config.request_timeout, connect=10.0)
+            )
+            self._client_pid = pid
+            self._rr = 0
+        return self._http
+
+    def _next_url(self) -> str:
+        """Round-robin the next server URL (per-process cursor)."""
+        urls = self.config.server_urls
+        url = urls[self._rr % len(urls)]
+        self._rr += 1
+        return url
+
+    def _fetch(self, sample: dict) -> EagleFetchPayload | None:
+        client = self._client()
+        url = self._next_url()
+        r = client.post(
+            f"{url}/v1/completions",
             json={
                 "model": self.config.model,
                 "prompt": sample["token_ids"],
@@ -492,7 +390,7 @@ async def _fetch(self, client: httpx.AsyncClient, sample: dict) -> EagleFetchPay
                 f"[streaming] path outside shared_storage_root for {sample['cid']}: {path!r}"
             )
             return None
-        token_ids, hidden_states = await asyncio.to_thread(self._load_safetensors, path)
+        token_ids, hidden_states = self._load_safetensors(path)
         # Contract: the server tokenization is the client's pre-tokenized prompt
         # verbatim, plus at most one decode-step token at the tail (from
         # ``max_tokens=1``). Anything else (e.g. server-side BOS prepend, chat
@@ -573,36 +471,3 @@ def _format(self, fetched: EagleFetchPayload) -> dict[str, torch.Tensor]:
             "loss_mask": loss_mask,
             "labels": labels,
         }
-
-
-class StreamingResumeCallback(TrainerCallback):
-    """Fast-forward :class:`StreamingDataset` past consumed samples on resume.
-
-    Dispatcher pulls a *global* batch per micro-step, hence the ``world_size`` factor.
-    Requires ``training_args.ignore_data_skip=True``; round-trips only when
-    ``world_size`` and ``config.seed`` match the original run.
-    """
-
-    def on_train_begin(self, args, state, control, train_dataloader=None, **kwargs):
-        """Push the skip count into the dataset when resuming mid-training."""
-        if state.global_step <= 0 or train_dataloader is None:
-            return
-        ds = train_dataloader.dataset
-        if not hasattr(ds, "set_resume_position"):
-            return
-        if not getattr(args, "ignore_data_skip", False):
-            raise RuntimeError(
-                "StreamingResumeCallback requires ignore_data_skip=True to avoid "
-                "double-skipping on resume."
-            )
-        consumed = (
-            state.global_step
-            * args.per_device_train_batch_size
-            * dist_utils.size()
-            * args.gradient_accumulation_steps
-        )
-        ds.set_resume_position(consumed)
-        print_rank_0(
-            f"[StreamingResumeCallback] resuming at global_step={state.global_step}; "
-            f"skipping {consumed} entries"
-        )
diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py
index a65a3183a05..a9670ec1efd 100644
--- a/modelopt/torch/speculative/plugins/hf_training_args.py
+++ b/modelopt/torch/speculative/plugins/hf_training_args.py
@@ -62,6 +62,9 @@ class DataArguments(BaseModel):
     sample_size: int = -1
     streaming_server_url: str | None = None
     streaming_model_name: str | None = None
+    # Deprecated / no-op: the streaming dataset is map-style now, so fetch concurrency
+    # comes from the DataLoader's ``dataloader_num_workers``, not this knob. Kept so
+    # existing yamls that set ``data.streaming_prefetch`` still validate.
     streaming_prefetch: int = Field(default=64, ge=1)
     # Mirror of the vLLM connector's ``shared_storage_path``; trainer-side allowlist.
     streaming_shared_storage_path: str | None = None
diff --git a/tests/examples/speculative_decoding/test_eagle_streaming.py b/tests/examples/speculative_decoding/test_eagle_streaming.py
index 291aa0f7929..3c8f7573957 100644
--- a/tests/examples/speculative_decoding/test_eagle_streaming.py
+++ b/tests/examples/speculative_decoding/test_eagle_streaming.py
@@ -118,13 +118,12 @@ def test_streaming_eagle_training(
         f"data.streaming_server_url={server_url}",
         f"data.streaming_model_name={tiny_llama_path}",
         f"data.streaming_shared_storage_path={scratch}",
-        "data.streaming_prefetch=2",
         f"training.output_dir={output_dir}",
         "training.num_train_epochs=1",
         "training.learning_rate=1e-5",
         "training.training_seq_len=32",
         "training.save_steps=1",
-        "training.dataloader_num_workers=0",  # enforced by StreamingDataset
+        "training.dataloader_num_workers=0",  # map-style; 0 keeps this test single-process
         *_TINY_EAGLE_ARCH,
     ]
 
diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
index 27210ee7286..4d094171967 100644
--- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
+++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
@@ -13,13 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for StreamingDataset's DDP contract.
+"""Tests for the map-style StreamingDataset.
 
-We do not spin up real torch.distributed; instead we monkeypatch the helper that
-reads rank/world_size. Sharding itself is delegated to Accelerate's
-``DataLoaderDispatcher`` (every rank holds the full corpus; only rank 0 iterates).
-These tests check the corpus-handling and rank-0-only-iter properties on which
-that delegation relies.
+The dataset is a plain ``torch.utils.data.Dataset``: DDP sharding is HF Trainer's
+job (``DistributedSampler``), so there is no rank/dispatch logic to test here.
+These tests cover the ``__getitem__`` contract: resample-on-miss, the
+consecutive-failure circuit breaker, and the vLLM wire-format -> batch-dict chain.
 """
 
 from pathlib import Path
@@ -30,7 +29,7 @@
 import safetensors.torch
 import torch
 
-# hf_streaming_dataset imports TrainerCallback / LabelSmoother at module scope.
+# hf_streaming_dataset imports LabelSmoother at module scope.
 pytest.importorskip("transformers")
 
 from modelopt.torch.speculative.plugins import hf_streaming_dataset
@@ -47,133 +46,100 @@ def _entries(n: int) -> list[dict]:
     return [{"id": i} for i in range(n)]
 
 
-@pytest.fixture
-def patch_dist(monkeypatch):
-    """Return a setter; tests call it with (rank, world) to simulate a DDP rank.
-
-    Patches ``modelopt.torch.utils.distributed.rank/size`` as imported into the
-    streaming dataset module (``dist_utils``). The dataset reads these in
-    ``__init__`` for logging and in ``__iter__`` for the rank-0-only gate.
-    """
-
-    def _set(rank: int, world: int):
-        # ``is_master`` etc. call ``rank(group=...)`` / ``size(group=...)`` — match the signature.
-        monkeypatch.setattr(hf_streaming_dataset.dist_utils, "rank", lambda group=None: rank)
-        monkeypatch.setattr(hf_streaming_dataset.dist_utils, "size", lambda group=None: world)
-
-    return _set
-
-
-def _entry_ids(ds: StreamingDataset) -> list[int]:
-    return [e["id"] for e in ds.entries]
-
-
-@pytest.mark.parametrize("world", [1, 2, 3, 8])
-def test_every_rank_holds_full_corpus(patch_dist, world):
-    """Each rank must see all entries — Accelerate's dispatcher does the sharding,
-    so any per-rank pre-shard here would shrink rank 0's view to 1/N and break
-    ``max_steps``.
-    """
-    corpus = _entries(100)
-    for rank in range(world):
-        patch_dist(rank, world)
-        ds = StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=42))
-        assert sorted(_entry_ids(ds)) == list(range(100))
-
-
-def test_same_seed_same_order(patch_dist):
-    """The shuffle is what makes rank 0's fetch order deterministic across reruns."""
-    corpus = _entries(50)
-    patch_dist(0, 1)
-    a = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=7)))
-    b = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=7)))
-    assert a == b
-
-
-def test_different_seed_different_order(patch_dist):
-    """Sanity: changing the seed actually reshuffles (else seed is vacuous)."""
-    corpus = _entries(50)
-    patch_dist(0, 1)
-    a = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=1)))
-    b = _entry_ids(StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=2)))
-    assert a != b
-    assert sorted(a) == sorted(b)
-
-
-def test_non_rank_zero_iter_is_empty(patch_dist):
-    """Non-zero ranks must yield nothing on ``__iter__`` — their producer would burn
-    server requests that ``DataLoaderDispatcher`` would discard."""
-    corpus = _entries(8)
-    patch_dist(2, 4)
-    ds = StreamingDataset(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=0))
-    assert list(iter(ds)) == []
-
-
-def test_iter_rejects_dataloader_workers(patch_dist, monkeypatch):
-    """Iterating from within a DataLoader worker must raise — multiple workers would
-    each spawn an asyncio loop and N× the request load on the server."""
-    patch_dist(0, 1)
-    ds = StreamingDataset(_entries(4), tokenizer=MagicMock(), config=StreamingConfig(seed=0))
-    # Pretend we're inside a DataLoader worker.
-    monkeypatch.setattr(hf_streaming_dataset, "get_worker_info", lambda: MagicMock())
-    with pytest.raises(RuntimeError, match="dataloader_num_workers=0"):
-        next(iter(ds))
-
-
-def test_empty_corpus_raises(patch_dist):
-    patch_dist(0, 1)
+def test_empty_corpus_raises():
     with pytest.raises(ValueError, match="entries is empty"):
         StreamingDataset([], tokenizer=MagicMock(), config=StreamingConfig())
 
 
-def test_set_resume_position_skips_entries_without_fetching(patch_dist):
-    """Resume should fast-forward inside the dataset without invoking _fetch.
+def test_len_matches_corpus():
+    ds = StreamingDataset(_entries(37), tokenizer=MagicMock(), config=StreamingConfig())
+    assert len(ds) == 37
 
-    Verifies the contract relied on by StreamingResumeCallback: skipped entries
-    are not sent to the server, so resume costs nothing on the inference side.
-    """
-    patch_dist(0, 1)
-    fetched_ids: list[int] = []
+
+def test_getitem_resamples_past_unfit_entries():
+    """An unfit entry (tokenize -> None) must not be returned; __getitem__ probes
+    forward to the next fetchable index and returns that instead."""
+    fetched_cids: list[int] = []
 
     class _Track(StreamingDataset):
         def _tokenize_entry(self, entry):
+            # Even ids are "unfit" (e.g. truncated away / missing fields).
+            if entry["id"] % 2 == 0:
+                return None
             return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
 
-        async def _fetch(self, client, sample):
-            fetched_ids.append(int(sample["cid"]))
+        def _fetch(self, sample):
+            fetched_cids.append(int(sample["cid"]))
+            return {"ok": True}
 
-    corpus = _entries(10)
-    ds = _Track(corpus, tokenizer=MagicMock(), config=StreamingConfig(seed=0, prefetch=2))
-    ds.set_resume_position(5)
-    list(ds)
+        def _format(self, fetched):
+            return {"sentinel": fetched_cids[-1]}
 
-    expected = {e["id"] for e in ds.entries[5:]}
-    assert set(fetched_ids) == expected
-    # _resume_skip is one-shot
-    assert ds._resume_skip == 0
+    ds = _Track(_entries(10), tokenizer=MagicMock(), config=StreamingConfig())
+    # idx 0 is unfit -> resamples forward to idx 1.
+    out = ds[0]
+    assert out == {"sentinel": 1}
+    assert fetched_cids == [1]
+    # An already-fit index is returned directly.
+    assert ds[3] == {"sentinel": 3}
 
 
-def test_circuit_breaker_trips_on_consecutive_fetch_failures(patch_dist):
-    """When _fetch keeps failing, the producer raises after the threshold so the
-    trainer sees a clear error instead of a silent empty epoch."""
-    patch_dist(0, 1)
+def test_circuit_breaker_trips_on_consecutive_failures():
+    """When _fetch keeps failing, __getitem__ raises after the threshold instead of
+    silently resampling the whole corpus."""
     threshold = 3
 
     class _AlwaysFails(StreamingDataset):
-        # Bypass tokenization so we don't need a real tokenizer.
         def _tokenize_entry(self, entry):
             return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
 
-        async def _fetch(self, client, sample):
+        def _fetch(self, sample):
             raise RuntimeError("simulated server failure")
 
     ds = _AlwaysFails(
         _entries(20),
         tokenizer=MagicMock(),
-        config=StreamingConfig(seed=0, prefetch=2, fail_after_consecutive_skips=threshold),
+        config=StreamingConfig(fail_after_consecutive_skips=threshold),
     )
     with pytest.raises(RuntimeError, match="consecutive _fetch failures"):
-        list(ds)
+        ds[0]
+
+
+def test_fetch_returning_none_exhausts_then_raises():
+    """If every entry's fetch yields None (e.g. all rejected), __getitem__ raises a
+    clear 'no fetchable sample' error rather than hanging or returning junk."""
+
+    class _AllNone(StreamingDataset):
+        def _tokenize_entry(self, entry):
+            return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
+
+        def _fetch(self, sample):
+            return None
+
+    ds = _AllNone(
+        _entries(4),
+        tokenizer=MagicMock(),
+        config=StreamingConfig(fail_after_consecutive_skips=100),
+    )
+    with pytest.raises(RuntimeError, match="no fetchable sample"):
+        ds[0]
+
+
+def test_server_urls_normalization():
+    """server_urls accepts a single string, a comma-separated string, or a list, and
+    strips trailing slashes."""
+
+    def _urls(v):
+        cfg = EagleVllmStreamingConfig(
+            server_urls=v, model="m", shared_storage_root=str(Path.cwd())
+        )
+        return cfg.server_urls
+
+    assert _urls("http://a:8000/") == ["http://a:8000"]
+    assert _urls("http://a:8000, http://b:8000/") == ["http://a:8000", "http://b:8000"]
+    assert _urls(["http://a:8000", "http://b:8000"]) == ["http://a:8000", "http://b:8000"]
+    with pytest.raises(ValueError, match="at least one non-empty URL"):
+        EagleVllmStreamingConfig(server_urls="", model="m", shared_storage_root=".")
 
 
 def _write_canned_safetensors(path: Path, seq: int, n_layers: int, hidden: int) -> None:
@@ -196,14 +162,23 @@ def _tokenizer_returning(seq: int) -> MagicMock:
     return tok
 
 
-def test_eagle_vllm_dataset_end_to_end(tmp_path, patch_dist, monkeypatch):
+def _patch_sync_client(monkeypatch, handler):
+    """Route the dataset's per-process httpx.Client through a MockTransport handler."""
+    real_client = httpx.Client
+
+    def mock_client(*args, **kwargs):
+        kwargs["transport"] = httpx.MockTransport(handler)
+        return real_client(*args, **kwargs)
+
+    monkeypatch.setattr(hf_streaming_dataset.httpx, "Client", mock_client)
+
+
+def test_eagle_vllm_dataset_end_to_end(tmp_path, monkeypatch):
     """Drive EagleVllmStreamingDataset against an in-process mocked server.
 
-    Verifies that the wire-format → tensor → batch-dict chain produces dicts
-    matching what EagleOfflineDataCollator expects, and that scratch files
-    are cleaned up after each fetch.
+    Verifies the wire-format -> tensor -> batch-dict chain produces dicts matching
+    what EagleOfflineDataCollator expects, and that scratch files are cleaned up.
     """
-    patch_dist(0, 1)
     seq, n_layers, hidden = 8, 3, 16  # n_layers = 1 final + 2 aux
     scratch = tmp_path / "vllm_scratch"
     scratch.mkdir()
@@ -219,37 +194,25 @@ def handler(request: httpx.Request) -> httpx.Response:
             json={"kv_transfer_params": {"hidden_states_path": str(path)}},
         )
 
-    real_async_client = httpx.AsyncClient
-
-    def mock_async_client(*args, **kwargs):
-        kwargs["transport"] = httpx.MockTransport(handler)
-        return real_async_client(*args, **kwargs)
-
-    monkeypatch.setattr(hf_streaming_dataset.httpx, "AsyncClient", mock_async_client)
+    _patch_sync_client(monkeypatch, handler)
 
     n_entries = 4
     entries = [
-        {
-            "conversation_id": f"c-{i}",
-            "messages": [{"role": "user", "content": "x"}],
-        }
+        {"conversation_id": f"c-{i}", "messages": [{"role": "user", "content": "x"}]}
         for i in range(n_entries)
     ]
     ds = EagleVllmStreamingDataset(
         entries=entries,
         tokenizer=_tokenizer_returning(seq),
         config=EagleVllmStreamingConfig(
-            server_url="http://mock:8000",
+            server_urls="http://mock:8000",
             model="mock-model",
             shared_storage_root=str(scratch),
-            prefetch=2,
-            seed=0,
         ),
     )
 
-    batches = list(ds)
+    batches = [ds[i] for i in range(n_entries)]
 
-    assert len(batches) == n_entries
     expected_keys = {
         "input_ids",
         "base_model_hidden_states",
@@ -275,9 +238,9 @@ def mock_async_client(*args, **kwargs):
     assert list(scratch.iterdir()) == [], "scratch files must be unlinked after fetch"
 
 
-def test_path_outside_shared_storage_root_is_rejected(tmp_path, patch_dist, monkeypatch):
-    """Out-of-root path from server is not opened or unlinked."""
-    patch_dist(0, 1)
+def test_path_outside_shared_storage_root_is_rejected(tmp_path, monkeypatch):
+    """Out-of-root path from the server is not opened or unlinked; the fetch yields
+    None, so the single-entry corpus is exhausted and __getitem__ raises."""
     seq, n_layers, hidden = 8, 3, 16
     allowed = tmp_path / "allowed"
     allowed.mkdir()
@@ -292,26 +255,19 @@ def handler(request: httpx.Request) -> httpx.Response:
             json={"kv_transfer_params": {"hidden_states_path": str(forbidden)}},
         )
 
-    real_async_client = httpx.AsyncClient
-
-    def mock_async_client(*args, **kwargs):
-        kwargs["transport"] = httpx.MockTransport(handler)
-        return real_async_client(*args, **kwargs)
-
-    monkeypatch.setattr(hf_streaming_dataset.httpx, "AsyncClient", mock_async_client)
+    _patch_sync_client(monkeypatch, handler)
 
     ds = EagleVllmStreamingDataset(
         entries=[{"conversation_id": "c-0", "messages": [{"role": "user", "content": "x"}]}],
         tokenizer=_tokenizer_returning(seq),
         config=EagleVllmStreamingConfig(
-            server_url="http://mock:8000",
+            server_urls="http://mock:8000",
             model="mock-model",
             shared_storage_root=str(allowed),
             fail_after_consecutive_skips=100,
-            prefetch=1,
-            seed=0,
         ),
     )
 
-    assert list(ds) == []
+    with pytest.raises(RuntimeError, match="no fetchable sample"):
+        ds[0]
     assert forbidden.exists(), "rejected path must not be unlinked"
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index 4a8dc8bbacf..a65cbc2bc41 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -224,8 +224,11 @@ wait_vllm_ready() {
 
 # Run the trainer then export the HF checkpoint.
 #   $1 = streaming server base URL   $2 = CUDA_VISIBLE_DEVICES ("" -> all)
-# dataloader_num_workers must be 0: the streaming dataset owns one asyncio loop
-# per process; multiple workers would duplicate requests against the server.
+# The streaming dataset is map-style now, so fetch concurrency comes from the
+# DataLoader's workers (each worker = one in-flight request). STREAMING_NUM_WORKERS
+# sets that; keep it modest so (ranks-per-server x workers) stays near the server's
+# max_num_seqs (flooding a cold NVFP4 MoE server kills EngineCore). 0 disables
+# prefetch (serialized fetches) and is usually too slow.
 run_trainer_and_export() {
     local url="$1" cvd="$2"
     # Optional multi-node trainer routing (see dispatch section). Defaults keep
@@ -247,7 +250,8 @@ run_trainer_and_export() {
         data.streaming_server_url="$url" \
         data.streaming_model_name="$HF_MODEL_CKPT" \
         data.streaming_shared_storage_path="$SERVE_SCRATCH" \
-        training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; }
+        training.dataloader_num_workers="${STREAMING_NUM_WORKERS:-4}" \
+        || { echo "ERROR: trainer failed." >&2; return 1; }
 
     # Export only on the head trainer (machine_rank 0); non-head trainer nodes
     # would race writing the same export dir. The export reads the saved
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
index fb92ba11234..a2b44c0ea15 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -1,13 +1,25 @@
 # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
-# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve +
-# trainer split, same hardware reasoning — but trains a DFlash drafter instead of
-# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the
-# dflash recipe.
+# GB200/Blackwell (HSG). Multi-node sibling of hf_streaming_dflash.yaml — same
+# vLLM-serve + trainer split and same hardware reasoning, but scales the trainer
+# across MULTIPLE nodes (1 serve node + N trainer nodes doing multi-node DDP)
+# instead of the single trainer node in hf_streaming_dflash.yaml.
 #
 # Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
 # NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
 # ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4,
-# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes.
+# whole node), nodes 1..N = DFlash trainers (fake base), 4 GPUs each. This file
+# allocates 3 nodes (1 serve + 2 trainers); bump slurm_config.nodes/segment to
+# add more trainer nodes.
+#
+# Topology (see common/eagle3/train_eagle_streaming.sh header for the full
+# dispatch): node 0 serves; the head trainer (Slurm node 1, accelerate
+# machine_rank 0) publishes its IP via /scratchspace for accelerate's c10d
+# rendezvous, and every trainer node reads both the serve address and the
+# head-trainer address from /scratchspace. segment=<nodes> pins all nodes into
+# one NVL72 block so inter-node DDP traffic rides NVLink. NOTE: the streaming
+# dataset is map-style, so every trainer rank fetches its own shard of hidden
+# states from the single serve, which remains the throughput ceiling — extra
+# trainer nodes scale effective batch / compute, not data-production throughput.
 #
 # How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
 # up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
@@ -39,7 +51,7 @@
 #          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
 #          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
 #          NEMORUN_HOME=$PWD
-#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
 #
 # The export lands in /scratchspace/export. To benchmark it, point

From cac937c339b0e40779168f22e7c127e2c3f043d3 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 06:29:53 +0000
Subject: [PATCH 03/14] multi node training

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/launch_train.sh | 16 +++++++--
 .../plugins/hf_streaming_dataset.py           | 34 +++++++++++++++----
 .../plugins/test_hf_streaming_dataset.py      | 24 +++++++++++++
 3 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
index fc623930767..3ff34b6ae9a 100755
--- a/examples/speculative_decoding/launch_train.sh
+++ b/examples/speculative_decoding/launch_train.sh
@@ -65,10 +65,22 @@ if [[ "$NUM_NODES" != "1" ]]; then
   # SLURM allocation reserves node 0 for something else, e.g. the streaming
   # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank).
   # Default to $SLURM_PROCID for the all-nodes-are-trainers case.
-  MULTI_NODE_ARGS="--num_processes $TOTAL_GPU \
+  # Canonical accelerate multi-node launch for a fixed Slurm allocation:
+  # --multi_gpu + static rendezvous via main_process_ip/port (-> MASTER_ADDR/PORT).
+  #
+  # --multi_gpu is REQUIRED: with 1 GPU/node, each node's local process count is
+  # num_processes/num_machines = 1, and without --multi_gpu accelerate treats a
+  # single local process as non-distributed -- it never sets WORLD_SIZE/RANK or
+  # forms the process group, so every node trains the full dataset as its own
+  # world=1 (no hang, no real DDP). --multi_gpu forces DistributedType.MULTI_GPU
+  # so the nodes rendezvous into one world=$TOTAL_GPU group.
+  #
+  # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which
+  # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip.
+  MULTI_NODE_ARGS="--multi_gpu \
+                   --num_processes $TOTAL_GPU \
                    --num_machines $NUM_NODES \
                    --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \
-                   --rdzv_backend c10d \
                    --main_process_ip $HEAD_NODE_IP \
                    --main_process_port 29500"
 fi
diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
index 6b050f00add..9c3655b76fe 100644
--- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
+++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
@@ -49,13 +49,14 @@
 
 import contextlib
 import os
+import time
 from pathlib import Path
 from typing import TypedDict
 
 import httpx
 import torch
 from pydantic import BaseModel, ConfigDict, Field, field_validator
-from safetensors import safe_open
+from safetensors import SafetensorError, safe_open
 from torch.utils.data import Dataset
 from transformers.trainer_pt_utils import LabelSmoother
 
@@ -63,6 +64,13 @@
 
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
+# The vLLM connector writes the safetensors file asynchronously (writer thread pool)
+# and returns its path before the write is durably visible, so an immediate read can
+# race the writer. Retry the open with linear backoff until the file lands
+# (worst case ~_READ_RETRIES * (_READ_RETRIES-1)/2 * _READ_BACKOFF s of sleep).
+_READ_RETRIES = 10
+_READ_BACKOFF = 0.05  # seconds
+
 
 def _tokenize_with_loss_mask(
     tokenizer,
@@ -427,13 +435,25 @@ def _load_safetensors(path: str) -> tuple[torch.Tensor, torch.Tensor]:
         ``safe_open(..., framework="pt").get_tensor`` materializes an independent
         torch Tensor (not a view into the mmap'd file), so it is safe to unlink
         right after the ``with`` block exits.
+
+        Retries past the writer race (see ``_READ_RETRIES``): a missing file means
+        the write hasn't started; a ``SafetensorError`` means it's mid-write. Both
+        clear once the writer finishes, so back off and retry before giving up.
         """
-        with safe_open(path, framework="pt") as f:
-            token_ids = f.get_tensor("token_ids")
-            hidden_states = f.get_tensor("hidden_states")  # [seq, n_layers, hidden]
-        with contextlib.suppress(OSError):
-            os.unlink(path)
-        return token_ids, hidden_states
+        for attempt in range(_READ_RETRIES):
+            try:
+                with safe_open(path, framework="pt") as f:
+                    token_ids = f.get_tensor("token_ids")
+                    hidden_states = f.get_tensor("hidden_states")  # [seq, n_layers, hidden]
+                with contextlib.suppress(OSError):
+                    os.unlink(path)
+                return token_ids, hidden_states
+            except (FileNotFoundError, SafetensorError):  # noqa: PERF203 -- retry-on-race loop
+                if attempt == _READ_RETRIES - 1:
+                    raise
+                time.sleep(_READ_BACKOFF * (attempt + 1))
+        # Reached only if _READ_RETRIES < 1; otherwise the last attempt above re-raises.
+        raise RuntimeError(f"_load_safetensors exhausted {_READ_RETRIES} retries for {path}")
 
     @staticmethod
     def _align_loss_mask(loss_mask: torch.Tensor, n: int) -> torch.Tensor:
diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
index 4d094171967..a0a37bc7afa 100644
--- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
+++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
@@ -271,3 +271,27 @@ def handler(request: httpx.Request) -> httpx.Response:
     with pytest.raises(RuntimeError, match="no fetchable sample"):
         ds[0]
     assert forbidden.exists(), "rejected path must not be unlinked"
+
+
+def test_load_safetensors_retries_past_writer_race(tmp_path, monkeypatch):
+    """The connector writes asynchronously, so an immediate read can race it;
+    _load_safetensors must retry past the transient FileNotFound/Safetensor error."""
+    seq, n_layers, hidden = 4, 2, 8
+    path = tmp_path / "late.safetensors"
+    _write_canned_safetensors(path, seq, n_layers, hidden)
+
+    calls = {"n": 0}
+    real_safe_open = hf_streaming_dataset.safe_open
+
+    def flaky_safe_open(p, framework):
+        calls["n"] += 1
+        if calls["n"] < 3:  # first 2 reads race the writer (file not ready yet)
+            raise FileNotFoundError(f"No such file or directory: {p}")
+        return real_safe_open(p, framework=framework)
+
+    monkeypatch.setattr(hf_streaming_dataset, "safe_open", flaky_safe_open)
+    monkeypatch.setattr(hf_streaming_dataset.time, "sleep", lambda *_: None)  # no real backoff
+
+    token_ids, hidden_states = EagleVllmStreamingDataset._load_safetensors(str(path))
+    assert calls["n"] == 3
+    assert hidden_states.shape == (seq, n_layers, hidden)

From 19551b621c1fbff3ee0f9182d8206a71e6f0b129 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 07:23:00 +0000
Subject: [PATCH 04/14] multinode serving

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 .../common/eagle3/train_eagle_streaming.sh    | 145 ++++++++++--------
 1 file changed, 81 insertions(+), 64 deletions(-)

diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index a65cbc2bc41..6f2875e525a 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -20,23 +20,24 @@
 # dumping to disk. Sibling of train_eagle.sh.
 #
 # Topology is chosen automatically from the Slurm allocation (the launcher yaml's
-# `nodes:` field); nemo_run runs this script once per node, so it branches on
-# $SLURM_NODEID:
-#   nodes == 1  -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of
-#                  the local GPUs (original single-node behavior).
-#   nodes == 2  -> split: node 0 runs vllm serve on all its GPUs, node 1 runs
-#                  the trainer on all its GPUs. Roles rendezvous through the
-#                  shared /scratchspace mount (node 0 publishes its serve
-#                  address; the trainer signals completion).
-#   nodes >= 3  -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1)
-#                  doing multi-node DDP. The head trainer (node 1, accelerate
-#                  machine_rank 0) publishes its IP for accelerate's c10d
-#                  rendezvous; all trainer nodes read both the serve address and
-#                  the head-trainer address from /scratchspace. NOTE: only global
-#                  rank 0 fetches hidden states from the single serve and
-#                  broadcasts to the rest (DataLoaderDispatcher), so the single
-#                  serve is the throughput ceiling — adding trainer nodes scales
-#                  effective batch / compute, not data-production throughput.
+# `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it
+# branches on $SLURM_NODEID:
+#   nodes == 1       -> co-located: vllm serve on $SERVE_GPU, trainer on the rest
+#                       of the local GPUs (original single-node behavior).
+#   nodes >= 2       -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent
+#                       vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1
+#                       are trainers doing multi-node DDP. SERVE_NODES defaults to 1
+#                       (1 serve + N trainers). Rendezvous over the shared
+#                       /scratchspace mount: each serve i publishes its address to
+#                       .serve_addr.i; the head trainer (first trainer node,
+#                       accelerate machine_rank 0) publishes its IP for accelerate's
+#                       rendezvous; trainers collect every serve address.
+#
+# The streaming dataset is map-style: HF Trainer's DistributedSampler shards the
+# corpus across all trainer ranks and each rank fetches ONLY its own shard,
+# round-robin across the SERVE_NODES replicas (data.streaming_server_url is the
+# comma-joined list). So trainer nodes scale effective batch / compute and
+# distribute the reads; serve nodes scale data-production throughput (~K x).
 #
 # Env vars (required):
 #   HF_MODEL_CKPT       Target model path. Used by both vllm serve (as the
@@ -48,6 +49,8 @@
 #                       default = [1,17,32] -> capture = [2,18,33,36].
 #
 # Env vars (optional):
+#   SERVE_NODES         multi-node only: number of dedicated serve replica nodes
+#                       (Slurm nodes 0..SERVE_NODES-1). default 1.
 #   SERVE_PORT          default 8765
 #   SERVE_GPU_MEM_UTIL  default 0.4 (single-node) / 0.9 (multi-node serve node)
 #   SERVE_READY_TIMEOUT seconds to wait for the server to come up. default 900
@@ -135,10 +138,15 @@ SCRIPT_ARGS=("$@")
 
 SERVE_PORT="${SERVE_PORT:-8765}"
 SERVE_READY_TIMEOUT="${SERVE_READY_TIMEOUT:-900}"
+# Number of dedicated serve replica nodes (multi-node only). Default 1.
+SERVE_NODES="${SERVE_NODES:-1}"
+# All serve replicas share one scratch dir; per-request safetensors files are keyed
+# by a unique vllm request id, so they don't collide across servers.
 SERVE_SCRATCH="/scratchspace/streaming_serve_scratch"
-SERVE_LOG="/scratchspace/vllm_serve.log"
-# Multi-node rendezvous over the shared /scratchspace mount (lustre, visible on
-# every node): node 0 publishes its address here, node 1 signals completion here.
+SERVE_LOG="/scratchspace/vllm_serve.log"   # serve nodes override with a per-node path
+# Rendezvous over the shared /scratchspace mount (lustre, visible on every node):
+# each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer
+# signals completion via DONE_FILE; trainers collect all serve addresses.
 SERVE_ADDR_FILE="/scratchspace/.serve_addr"
 DONE_FILE="/scratchspace/.training_done"
 SERVE_PID=""
@@ -153,6 +161,18 @@ cleanup() {
 
 gpus_on_node() { nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n1; }
 
+# Resolve a *routable* IP for this node (other nodes must be able to dial it).
+# `hostname -I` can list a link-local (169.254.x) or loopback address first, so
+# prefer the resolved Slurm node name, then the first non-loopback/non-link-local IP.
+#   $1 = optional override (e.g. SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP)
+resolve_routable_ip() {
+    local ip="$1"
+    [ -z "$ip" ] && ip=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
+    [ -z "$ip" ] && ip=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1)
+    [ -z "$ip" ] && ip=$(hostname -I | awk '{print $1}')
+    echo "$ip"
+}
+
 # Start vllm serve in the background. Sets SERVE_PID.
 #   $1 = bind host   $2 = tensor-parallel size   $3 = CUDA_VISIBLE_DEVICES ("" -> all)
 launch_vllm() {
@@ -306,71 +326,68 @@ PY
     wait_vllm_ready "http://${SERVE_HOST}:${SERVE_PORT}" || exit 1
     run_trainer_and_export "http://${SERVE_HOST}:${SERVE_PORT}" "$TRAIN_GPUS" || exit 1
 
-elif [ "$NODEID" -eq 0 ]; then
-    # ----------------------- multi-node: serve node ------------------------
-    SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}"   # dedicated node -> use most of it
-    SERVE_TP="${SERVE_TP:-$(gpus_on_node)}"            # default: all GPUs on this node
-    rm -f "$SERVE_ADDR_FILE" "$DONE_FILE"              # clear stale rendezvous state
+elif [ "$NODEID" -lt "$SERVE_NODES" ]; then
+    # ---------------------- multi-node: serve node(s) ----------------------
+    # Slurm nodes 0..SERVE_NODES-1 each run an independent vllm serve replica on
+    # their whole node and publish their address to ${SERVE_ADDR_FILE}.${NODEID}.
+    SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}"     # dedicated node -> use most of it
+    SERVE_TP="${SERVE_TP:-$(gpus_on_node)}"              # default: all GPUs on this node
+    SERVE_LOG="/scratchspace/vllm_serve.${NODEID}.log"  # per-node log (avoid collision)
+    rm -f "${SERVE_ADDR_FILE}.${NODEID}"                 # clear own stale address
+    [ "$NODEID" -eq 0 ] && rm -f "$DONE_FILE"            # node 0 clears the shared sentinel once
 
     trap cleanup INT TERM EXIT
     launch_vllm "0.0.0.0" "$SERVE_TP" ""
     wait_vllm_ready "http://127.0.0.1:${SERVE_PORT}" || exit 1
 
-    # Publish a *routable* address for the trainer node. `hostname -I` can list a
-    # link-local (169.254.x) or loopback address first, which is unreachable from
-    # the other node, so resolve the Slurm node name and fall back to the first
-    # non-link-local / non-loopback IP.
-    serve_addr="${SERVE_ADVERTISE_IP:-}"
-    if [ -z "$serve_addr" ]; then
-        serve_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
-    fi
-    if [ -z "$serve_addr" ]; then
-        serve_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1)
-    fi
-    [ -z "$serve_addr" ] && serve_addr=$(hostname -I | awk '{print $1}')
-    echo "$serve_addr" > "$SERVE_ADDR_FILE"
-    echo "Serve node published ${serve_addr}; holding the server up until the trainer signals done..."
+    serve_addr=$(resolve_routable_ip "${SERVE_ADVERTISE_IP:-}")
+    echo "$serve_addr" > "${SERVE_ADDR_FILE}.${NODEID}"
+    echo "Serve node ${NODEID}/${SERVE_NODES} published ${serve_addr}; holding up until training signals done..."
     while [ ! -f "$DONE_FILE" ]; do sleep 10; done
-    echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)."
+    echo "Training-done sentinel seen; serve node ${NODEID} exiting (EXIT trap stops vllm)."
 
-elif [ "$NODEID" -ge 1 ]; then
+else
     # -------------------- multi-node: trainer node(s) ----------------------
-    # Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which
-    # map to 0-based accelerate machine ranks (head trainer = SLURM node 1).
-    NUM_TRAINER_NODES=$(( NNODES - 1 ))
-    TRAINER_RANK=$(( NODEID - 1 ))
+    # Serve nodes are 0..SERVE_NODES-1; trainer nodes are SERVE_NODES..NNODES-1,
+    # mapping to 0-based accelerate machine ranks (head trainer = first trainer node).
+    NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES ))
+    TRAINER_RANK=$(( NODEID - SERVE_NODES ))
     TRAINER_ADDR_FILE="/scratchspace/.trainer_addr"
 
-    # Only the head trainer (rank 0) signals the serve node to release on exit;
-    # a non-head node exiting first must NOT tear the serve down early.
+    # Only the head trainer (rank 0) signals the serve nodes to release on exit;
+    # a non-head node exiting first must NOT tear the serves down early.
     if [ "$TRAINER_RANK" -eq 0 ]; then
         trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
         rm -f "$TRAINER_ADDR_FILE"                 # clear stale rendezvous state
     fi
 
-    echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..."
-    for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
-        [ -f "$SERVE_ADDR_FILE" ] && break
-        sleep 1
+    # Collect every serve replica's address and build the comma-joined URL list the
+    # streaming dataset round-robins across (one fetch per worker, spread over serves).
+    echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for ${SERVE_NODES} serve address(es)..."
+    URLS=""
+    for ((s = 0; s < SERVE_NODES; s++)); do
+        af="${SERVE_ADDR_FILE}.${s}"
+        for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
+            [ -f "$af" ] && break
+            sleep 1
+        done
+        [ -f "$af" ] || { echo "ERROR: serve node ${s} never published its address." >&2; exit 1; }
+        surl="http://$(cat "$af"):${SERVE_PORT}"
+        wait_vllm_ready "$surl" || exit 1
+        URLS="${URLS:+$URLS,}$surl"
     done
-    [ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; }
-    URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}"
-    wait_vllm_ready "$URL" || exit 1
+    echo "Trainer rank ${TRAINER_RANK} using serve URLs: ${URLS}"
 
     if [ "$NUM_TRAINER_NODES" -le 1 ]; then
-        # Original 1-serve + 1-trainer topology: single-node DDP, unchanged.
-        run_trainer_and_export "$URL" "" || exit 1
+        # 1 trainer node: single-node DDP (no accelerate multi-node routing).
+        run_trainer_and_export "$URLS" "" || exit 1
     else
         # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's
-        # c10d rendezvous (port 29500); all trainer nodes read it and join. Reuse
-        # the serve node's IP-resolution logic (avoid link-local / loopback).
+        # rendezvous (port 29500); all trainer nodes read it and join.
         if [ "$TRAINER_RANK" -eq 0 ]; then
-            head_addr="${TRAINER_ADVERTISE_IP:-}"
-            [ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
-            [ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1)
-            [ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}')
+            head_addr=$(resolve_routable_ip "${TRAINER_ADVERTISE_IP:-}")
             echo "$head_addr" > "$TRAINER_ADDR_FILE"
-            echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous."
+            echo "Head trainer (rank 0) published ${head_addr} for accelerate rendezvous."
         else
             echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..."
             for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do
@@ -380,7 +397,7 @@ elif [ "$NODEID" -ge 1 ]; then
             [ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; }
         fi
         HEAD_IP=$(cat "$TRAINER_ADDR_FILE")
-        run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1
+        run_trainer_and_export "$URLS" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1
     fi
 fi
 

From c524abf1d33a5c6415fb571d68852daa30759563 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 07:24:09 +0000
Subject: [PATCH 05/14] qwen multinode streaming example

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 .../hf_streaming_eagle3_multi_node.yaml       | 112 ++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
new file mode 100644
index 00000000000..aac5b71ecdf
--- /dev/null
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
@@ -0,0 +1,112 @@
+# EAGLE3 streaming speculative decoding pipeline for Qwen3-8B — MULTI-NODE.
+#
+# Multi-node sibling of hf_streaming_eagle3.yaml. Both the serve and trainer sides
+# scale across multiple nodes, with multiple GPUs per node. task_1 allocates 4
+# nodes x 2 GPUs: SERVE_NODES (=2) run independent vllm serve replicas (TP=2 each),
+# the remaining 2 run multi-node-DDP trainers (2 GPUs each -> world_size=4). Tune
+# the split via slurm_config.nodes/gpus_per_node + the SERVE_NODES env:
+#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
+#
+# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
+# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
+# K..N-1 are trainers. The head trainer publishes its IP for accelerate's
+# rendezvous; every trainer reads all serve addresses and joins the DDP group.
+#
+# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler
+# shards the corpus across ALL trainer ranks and each rank fetches ONLY its own
+# shard, round-robin across the K serve replicas (data.streaming_server_url is the
+# comma-joined list). Trainer nodes scale effective batch / compute and distribute
+# the lustre reads; serve nodes scale data-production throughput (~K x), lifting
+# the single-serve ceiling.
+#
+# 3-step pipeline:
+#   task_0: Build input conversations (jsonl)
+#   task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
+#   task_2: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml --yes
+
+job_name: Qwen3-8B_EAGLE3_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-8B
+
+  # Step 1: Build input conversations
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
+  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
+  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
+  #
+  # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32];
+  # vllm capture ids are those shifted by +1, plus the final layer:
+  #   [2, 18, 33] + [36] = [2, 18, 33, 36].
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+      - training.num_train_epochs=1
+      - eagle.eagle_use_torch_compile=false
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces: nemo_run emits `export FOO=value` without quotes, so a
+      # space-separated value would be split by the shell.
+      - EAGLE_CAPTURE_IDS: "[2,18,33,36]"
+      # Each serve node has 2 GPUs -> TP=2.
+      - SERVE_TP: "2"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES.
+      # Here 4 ranks (2 nodes x 2 GPU) x 4 / 2 serves = 8 concurrent per serve — fine
+      # for Qwen's max_num_seqs.
+      - STREAMING_NUM_WORKERS: "4"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 4
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Benchmark speculative decoding (VLLM backend)
+  task_2:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 1
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:latest

From 8eb3525a7f1b281e4aa742bbedb1bb33faa20221 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 10:23:38 +0000
Subject: [PATCH 06/14] add kimi example

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 .../Kimi-K2.5/hf_streaming_dflash.yaml        | 15 ++--
 .../hf_streaming_dflash_multi_node.yaml       | 68 +++++++++++--------
 .../Kimi-K2.5/hf_streaming_eagle3.yaml        | 15 ++--
 3 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
index ff99ae62c7f..d16ca3822c7 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -81,10 +81,6 @@ pipeline:
       - model.trust_remote_code=true
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
-      # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4
-      # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker
-      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
-      - data.streaming_prefetch=8
       - training.output_dir=/scratchspace/dflash
       # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
       - training.training_seq_len=4096
@@ -106,6 +102,12 @@ pipeline:
       # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
       - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
       - SERVE_TP: "4"
+      # DataLoader workers per trainer rank = in-flight requests per rank. The
+      # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0),
+      # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS.
+      # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE
+      # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort.
+      - STREAMING_NUM_WORKERS: "1"
       # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
       # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
@@ -126,6 +128,11 @@ pipeline:
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 2
+      # Pin the serve node + trainer node into one NVL72 block. Inter-node here is
+      # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this
+      # is a latency/locality nicety rather than a correctness requirement (cf. the
+      # multi_node examples, where cross-node trainer DDP makes segment essential).
+      segment: 2
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
index a2b44c0ea15..b645a0e428c 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -1,25 +1,35 @@
 # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
-# GB200/Blackwell (HSG). Multi-node sibling of hf_streaming_dflash.yaml — same
-# vLLM-serve + trainer split and same hardware reasoning, but scales the trainer
-# across MULTIPLE nodes (1 serve node + N trainer nodes doing multi-node DDP)
-# instead of the single trainer node in hf_streaming_dflash.yaml.
+# GB200/Blackwell (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of
+# hf_streaming_dflash.yaml: BOTH sides scale out. SERVE_NODES (=2) run independent
+# vllm serve replicas (TP=4 each, whole node); the remaining nodes run multi-node-DDP
+# trainers (4 GPUs each). This file allocates 4 nodes = 2 serve + 2 trainer
+# (world_size=8). Tune the split via slurm_config.nodes/segment + SERVE_NODES:
+#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
 #
 # Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
 # NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
-# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4,
-# whole node), nodes 1..N = DFlash trainers (fake base), 4 GPUs each. This file
-# allocates 3 nodes (1 serve + 2 trainers); bump slurm_config.nodes/segment to
-# add more trainer nodes.
+# ~138 GB/GPU of weights) with NO cpu-offload. So each serve replica owns a whole
+# node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base).
 #
-# Topology (see common/eagle3/train_eagle_streaming.sh header for the full
-# dispatch): node 0 serves; the head trainer (Slurm node 1, accelerate
-# machine_rank 0) publishes its IP via /scratchspace for accelerate's c10d
-# rendezvous, and every trainer node reads both the serve address and the
-# head-trainer address from /scratchspace. segment=<nodes> pins all nodes into
-# one NVL72 block so inter-node DDP traffic rides NVLink. NOTE: only global rank
-# 0 fetches hidden states from the single serve and broadcasts them to the rest
-# (DataLoaderDispatcher), so the single serve is the throughput ceiling — extra
-# trainer nodes scale effective batch / compute, not data-production throughput.
+# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
+# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
+# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d
+# rendezvous; every trainer reads all K serve addresses and joins the DDP group.
+# segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
+# rides NVLink.
+#
+# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF
+# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each
+# rank fetches ONLY its own shard, round-robin across the K serve replicas
+# (data.streaming_server_url is the comma-joined list the script assembles). So
+# trainer nodes scale effective batch / compute and distribute the lustre reads;
+# serve nodes scale data-production throughput (~K x), lifting the single-serve
+# ceiling that bounded the old single-serve multi-node path.
+#
+# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below):
+# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
+# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server
+# stalls a worker past vLLM's execute-model timeout and kills EngineCore.
 #
 # How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
 # up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
@@ -81,9 +91,9 @@ pipeline:
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
-  # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node.
-  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh):
-  # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ.
+  # Step 2: Streaming DFlash training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each).
+  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
+  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -93,16 +103,12 @@ pipeline:
       - model.trust_remote_code=true
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
-      # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4
-      # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker
-      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
-      - data.streaming_prefetch=8
       - training.output_dir=/scratchspace/dflash
       # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
       - training.training_seq_len=4096
       - training.disable_tqdm=true
-      - training.num_train_epochs=1
       - training.ar_validate_steps=500000
+      - training.num_train_epochs=1
       - training.max_steps=500
       # See header: Kimi's template lacks {% generation %} tags; train on all tokens.
       - training.answer_only_loss=false
@@ -118,7 +124,15 @@ pipeline:
       # No spaces in values: nemo_run emits `export FOO=value` unquoted.
       # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
       - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi).
       - SERVE_TP: "4"
+      # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1:
+      # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS.
+      # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker
+      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
+      - STREAMING_NUM_WORKERS: "1"
       # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
       # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
@@ -138,8 +152,8 @@ pipeline:
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
       _factory_: "slurm_factory"
-      nodes: 3
-      segment: 3
+      nodes: 4
+      segment: 4
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
index 24487ab8621..9ed18150869 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
@@ -55,10 +55,6 @@ pipeline:
       - model.trust_remote_code=true
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
-      # Keep concurrent in-flight requests low: a 64-wide flood made cold NVFP4
-      # MoE kernels/flashinfer autotune stall a worker past vLLM's engine<->worker
-      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
-      - data.streaming_prefetch=8
       - training.output_dir=/scratchspace/eagle3
       - training.training_seq_len=4096
       - training.disable_tqdm=true
@@ -71,6 +67,12 @@ pipeline:
       # No spaces in values: nemo_run emits `export FOO=value` unquoted.
       - EAGLE_CAPTURE_IDS: "[2,30,58,60]"
       - SERVE_TP: "4"
+      # DataLoader workers per trainer rank = in-flight requests per rank. The
+      # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0),
+      # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS.
+      # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE
+      # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort.
+      - STREAMING_NUM_WORKERS: "1"
       # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
       # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with
       # 183/184 GB used). Cap context to the training seq len and leave headroom
@@ -90,6 +92,11 @@ pipeline:
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 2
+      # Pin the serve node + trainer node into one NVL72 block. Inter-node here is
+      # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this
+      # is a latency/locality nicety rather than a correctness requirement (cf. the
+      # multi_node examples, where cross-node trainer DDP makes segment essential).
+      segment: 2
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest

From 02656249872cf7d5f42431dda0916caec2ea9e24 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Wed, 3 Jun 2026 10:24:52 +0000
Subject: [PATCH 07/14] add k25 eagle3 multinode streaming

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 .../hf_streaming_eagle3_multi_node.yaml       | 151 ++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml

diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
new file mode 100644
index 00000000000..e0c32debb0e
--- /dev/null
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
@@ -0,0 +1,151 @@
+# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell
+# (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of hf_streaming_eagle3.yaml:
+# BOTH sides scale out. SERVE_NODES (=2) run independent vllm serve replicas (TP=4
+# each, whole node); the remaining nodes run multi-node-DDP trainers (4 GPUs each).
+# This file allocates 4 nodes = 2 serve + 2 trainer (world_size=8). Tune the split
+# via slurm_config.nodes/segment + SERVE_NODES:
+#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
+#
+# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
+# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
+# ~138 GB/GPU of weights) with NO cpu-offload. On CW H100 the model needed
+# cpu-offload (-> ~1 tok/s -> vLLM EngineCore TimeoutError), so GB200 is the
+# working path. Each serve replica owns a whole node at TP=4; each trainer node
+# uses all 4 GPUs for the draft (fake base).
+#
+# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
+# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
+# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d
+# rendezvous; every trainer reads all K serve addresses and joins the DDP group.
+# segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
+# rides NVLink.
+#
+# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF
+# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each
+# rank fetches ONLY its own shard, round-robin across the K serve replicas
+# (data.streaming_server_url is the comma-joined list the script assembles). So
+# trainer nodes scale effective batch / compute and distribute the lustre reads;
+# serve nodes scale data-production throughput (~K x), lifting the single-serve
+# ceiling.
+#
+# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below):
+# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
+# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server
+# stalls a worker past vLLM's execute-model timeout and kills EngineCore.
+#
+# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed
+# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60.
+# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base.
+#
+# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+#          SLURM_PARTITION=batch \
+#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
+#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          NEMORUN_HOME=$PWD
+#   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml \
+#          identity=$HOME/.ssh/id_ecdsa detach=True --yes
+
+job_name: Kimi-K2.5-NVFP4_EAGLE3_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+
+  # Step 1: Build input conversations (model-agnostic)
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
+      # so request 4 even though make_dataset is CPU-only.
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPUs each).
+  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
+  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.use_fake_base_for_offline=true
+      - model.trust_remote_code=true
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+      - training.num_train_epochs=1
+      - training.max_steps=500
+      - eagle.eagle_use_torch_compile=false
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces in values: nemo_run emits `export FOO=value` unquoted.
+      - EAGLE_CAPTURE_IDS: "[2,30,58,60]"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi).
+      - SERVE_TP: "4"
+      # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1:
+      # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS.
+      # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker
+      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
+      - STREAMING_NUM_WORKERS: "1"
+      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
+      # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with
+      # 183/184 GB used). Cap context to the training seq len and leave headroom
+      # for activation spikes during the profiling forward.
+      - SERVE_MAX_MODEL_LEN: "4096"
+      # Small batches: smaller per-step MoE compute stays under the engine timeout.
+      - SERVE_MAX_NUM_SEQS: "4"
+      - SERVE_GPU_MEM_UTIL: "0.8"
+      - SERVE_READY_TIMEOUT: "2400"
+      - SERVE_EXTRA_ARGS: "--trust-remote-code"
+      # The killer was "RPC call to sample_tokens timed out" — a worker stalls on
+      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
+      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
+      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
+      - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 4
+      segment: 4
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Benchmark speculative decoding (VLLM backend, Kimi served at TP=4)
+  task_2:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 32
+      # Kimi has custom modeling code; bench run.py loads base+tokenizer and needs this.
+      - --trust_remote_code
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest

From fc393663cd39475ac6114e20ad813191936d74bb Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 06:57:20 +0000
Subject: [PATCH 08/14] address comment

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/launch_train.sh | 26 +++++++++++++------
 .../common/eagle3/train_eagle_streaming.sh    | 19 +++++++++++---
 tools/launcher/core.py                        | 13 +++++++---
 3 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
index 3ff34b6ae9a..cd890c72053 100755
--- a/examples/speculative_decoding/launch_train.sh
+++ b/examples/speculative_decoding/launch_train.sh
@@ -59,7 +59,7 @@ else
 fi
 
 # Multi-node routing args (accelerate only; training config comes from the YAML)
-MULTI_NODE_ARGS=""
+MULTI_NODE_ARGS=()
 if [[ "$NUM_NODES" != "1" ]]; then
   # machine_rank: caller may pass --machine_rank explicitly (needed when the
   # SLURM allocation reserves node 0 for something else, e.g. the streaming
@@ -77,17 +77,27 @@ if [[ "$NUM_NODES" != "1" ]]; then
   #
   # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which
   # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip.
-  MULTI_NODE_ARGS="--multi_gpu \
-                   --num_processes $TOTAL_GPU \
-                   --num_machines $NUM_NODES \
-                   --machine_rank ${MACHINE_RANK:-$SLURM_PROCID} \
-                   --main_process_ip $HEAD_NODE_IP \
-                   --main_process_port 29500"
+  MULTI_NODE_ARGS=(
+    --multi_gpu
+    --num_processes "$TOTAL_GPU"
+    --num_machines "$NUM_NODES"
+    --machine_rank "${MACHINE_RANK:-$SLURM_PROCID}"
+    --main_process_ip "$HEAD_NODE_IP"
+    --main_process_port 29500
+  )
 fi
 
 export TOKENIZERS_PARALLELISM=False
 
+# Build the argv directly (no `sh -c`): a re-parsed command string would word-split
+# overrides that contain spaces (e.g. training.output_dir=/tmp/has space) and would
+# execute command substitutions embedded in override values. An array preserves each
+# argument boundary verbatim.
+CMD=(accelerate launch --mixed_precision bf16
+     "${MULTI_NODE_ARGS[@]}"
+     "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}")
+
 set -x
 start_time=$(date +%s)
-sh -c "accelerate launch --mixed_precision bf16 $MULTI_NODE_ARGS ${SCRIPT_DIR}/main.py --config $CONFIG_FILE ${EXTRA_ARGS[*]}"
+"${CMD[@]}"
 echo "Total time: $(( $(date +%s) - $start_time )) seconds"
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index 6f2875e525a..b3637b1621d 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -147,8 +147,13 @@ SERVE_LOG="/scratchspace/vllm_serve.log"   # serve nodes override with a per-nod
 # Rendezvous over the shared /scratchspace mount (lustre, visible on every node):
 # each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer
 # signals completion via DONE_FILE; trainers collect all serve addresses.
-SERVE_ADDR_FILE="/scratchspace/.serve_addr"
-DONE_FILE="/scratchspace/.training_done"
+# Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on
+# the same shared mount don't read/write each other's addresses. SLURM_JOB_ID is
+# identical across every node of one allocation (so the namespacing is consistent)
+# and unique across allocations; falls back to a fixed token off-Slurm (single run).
+RUN_ID="${SLURM_JOB_ID:-local}"
+SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}"
+DONE_FILE="/scratchspace/.training_done.${RUN_ID}"
 SERVE_PID=""
 mkdir -p "$SERVE_SCRATCH"
 
@@ -302,6 +307,14 @@ run_trainer_and_export() {
 NNODES="${SLURM_NNODES:-1}"
 NODEID="${SLURM_NODEID:-0}"
 
+# Multi-node needs at least one trainer node: with SERVE_NODES >= NNODES every node
+# takes the serve branch, so no trainer ever publishes the rendezvous address or the
+# DONE_FILE and the serve nodes block forever. Reject it up front.
+if [ "$NNODES" -gt 1 ] && [ "$SERVE_NODES" -ge "$NNODES" ]; then
+    echo "ERROR: SERVE_NODES ($SERVE_NODES) must be < SLURM_NNODES ($NNODES); need >=1 trainer node." >&2
+    exit 1
+fi
+
 if [ "$NNODES" -le 1 ]; then
     # ----------------------------- single node -----------------------------
     SERVE_HOST="${SERVE_HOST:-127.0.0.1}"
@@ -352,7 +365,7 @@ else
     # mapping to 0-based accelerate machine ranks (head trainer = first trainer node).
     NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES ))
     TRAINER_RANK=$(( NODEID - SERVE_NODES ))
-    TRAINER_ADDR_FILE="/scratchspace/.trainer_addr"
+    TRAINER_ADDR_FILE="/scratchspace/.trainer_addr.${RUN_ID}"  # per-job (see RUN_ID)
 
     # Only the head trainer (rank 0) signals the serve nodes to release on exit;
     # a non-head node exiting first must NOT tear the serves down early.
diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index f6ae6493af3..0639d2afac7 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -270,6 +270,15 @@ def build_slurm_executor(
             identity=identity,
         )
 
+    # --segment=<N>: pin all nodes into one topology block (one NVL72 / NVLink domain).
+    # Read it with getattr, not attribute access: older/custom SlurmConfig types patched
+    # in via set_slurm_config_type may predate the `segment` field and would otherwise
+    # raise AttributeError. None -> omit the kwarg so the scheduler places freely (default).
+    optional_kwargs = {}
+    segment = getattr(slurm_config, "segment", None)
+    if segment is not None:
+        optional_kwargs["segment"] = segment
+
     executor = run.SlurmExecutor(
         account=slurm_config.account,
         partition=slurm_config.partition,
@@ -286,9 +295,7 @@ def build_slurm_executor(
         retries=0,
         packager=packager,
         srun_args=slurm_config.srun_args,
-        # --segment=<N>: pin all nodes into one topology block (one NVL72 / NVLink
-        # domain). None -> omitted, scheduler places freely (default behavior).
-        segment=slurm_config.segment,
+        **optional_kwargs,
     )
     return executor
 

From 9882ee0d2cd7eb21455076ce657b2c55ec7be4ac Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 07:30:27 +0000
Subject: [PATCH 09/14] address comments

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 modelopt/recipe/config.py                     | 12 +++
 modelopt/torch/speculative/config.py          | 12 +++
 .../plugins/hf_streaming_dataset.py           | 21 +++++-
 .../speculative/plugins/hf_training_args.py   |  2 +
 .../plugins/test_hf_streaming_dataset.py      | 73 ++++++++++++++++++-
 tools/launcher/core.py                        | 20 +++++
 .../moonshotai/Kimi-K2.5/specdec_bench.yaml   | 11 ++-
 tools/launcher/slurm_config.py                |  2 +
 8 files changed, 144 insertions(+), 9 deletions(-)

diff --git a/modelopt/recipe/config.py b/modelopt/recipe/config.py
index 97d93bbafc6..0932095f6d4 100644
--- a/modelopt/recipe/config.py
+++ b/modelopt/recipe/config.py
@@ -31,6 +31,18 @@
     TrainingArguments as SpecTrainingArgs,
 )
 
+__all__ = [
+    "RECIPE_TYPE_TO_CLASS",
+    "ModelOptDFlashRecipe",
+    "ModelOptEagleRecipe",
+    "ModelOptMedusaRecipe",
+    "ModelOptPTQRecipe",
+    "ModelOptRecipeBase",
+    "ModelOptSpeculativeRecipeBase",
+    "RecipeMetadataConfig",
+    "RecipeType",
+]
+
 
 class RecipeType(str, Enum):
     """List of recipe types. See ``RECIPE_TYPE_TO_CLASS`` at the bottom for the schema mapping."""
diff --git a/modelopt/torch/speculative/config.py b/modelopt/torch/speculative/config.py
index 23ad200b6e7..708deafc0d1 100644
--- a/modelopt/torch/speculative/config.py
+++ b/modelopt/torch/speculative/config.py
@@ -23,6 +23,18 @@
 
 from .eagle.default_config import default_eagle_config, default_kimik2_eagle_config
 
+__all__ = [
+    "DFLASH_DEFAULT_CFG",
+    "EAGLE3_DEFAULT_CFG",
+    "EAGLE_MTP_DEFAULT_CFG",
+    "DFlashConfig",
+    "EagleConfig",
+    "MedusaConfig",
+    "eagle3_default_config",
+    "eagle_mtp_default_config",
+    "kimik2_eagle_default_config",
+]
+
 kimik2_eagle_default_config = deepcopy(default_kimik2_eagle_config)
 
 eagle3_default_config = deepcopy(default_eagle_config)
diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
index 9c3655b76fe..65b2cd4f0d7 100644
--- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
+++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
@@ -62,6 +62,14 @@
 
 from modelopt.torch.utils import print_rank_0, warn_rank_0
 
+__all__ = [
+    "EagleFetchPayload",
+    "EagleVllmStreamingConfig",
+    "EagleVllmStreamingDataset",
+    "StreamingConfig",
+    "StreamingDataset",
+]
+
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
 # The vLLM connector writes the safetensors file asynchronously (writer thread pool)
@@ -71,6 +79,13 @@
 _READ_RETRIES = 10
 _READ_BACKOFF = 0.05  # seconds
 
+# Errors from ``_fetch`` that are genuinely transient (server overloaded / connection
+# reset / timeout, or the safetensors writer race) and so count against the circuit
+# breaker and trigger a resample. Anything else -- notably the ``RuntimeError`` raised
+# on server token drift, or a programming/contract bug (``ValueError``/``KeyError``) --
+# is a real fault and propagates instead of being silently masked as a fetch miss.
+_TRANSIENT_FETCH_ERRORS = (httpx.HTTPError, OSError, SafetensorError)
+
 
 def _tokenize_with_loss_mask(
     tokenizer,
@@ -201,8 +216,10 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
                 continue  # entry unfit pre-fetch; server not at fault, try the next one
             try:
                 fetched = self._fetch(sample)
-            except Exception as e:
-                warn_rank_0(f"[streaming] error for {sample['cid']}: {e!r}")
+            except _TRANSIENT_FETCH_ERRORS as e:
+                # Transport/IO miss: count against the circuit breaker and resample.
+                # Contract violations and bugs are not caught here -- they propagate.
+                warn_rank_0(f"[streaming] fetch error for {sample['cid']}: {e!r}")
                 fetched = None
             if fetched is None:
                 self._consecutive_fail += 1
diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py
index a9670ec1efd..2a9d4a1c4ff 100644
--- a/modelopt/torch/speculative/plugins/hf_training_args.py
+++ b/modelopt/torch/speculative/plugins/hf_training_args.py
@@ -33,6 +33,8 @@
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
+__all__ = ["DataArguments", "ModelArguments", "TrainingArguments"]
+
 
 class ModelArguments(BaseModel):
     """Arguments for loading the base HF model."""
diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
index a0a37bc7afa..d4b910fe237 100644
--- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
+++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
@@ -85,8 +85,8 @@ def _format(self, fetched):
 
 
 def test_circuit_breaker_trips_on_consecutive_failures():
-    """When _fetch keeps failing, __getitem__ raises after the threshold instead of
-    silently resampling the whole corpus."""
+    """When _fetch keeps hitting transient errors (server down), __getitem__ raises
+    after the threshold instead of silently resampling the whole corpus."""
     threshold = 3
 
     class _AlwaysFails(StreamingDataset):
@@ -94,7 +94,8 @@ def _tokenize_entry(self, entry):
             return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
 
         def _fetch(self, sample):
-            raise RuntimeError("simulated server failure")
+            # A down server surfaces as a transport error, which the breaker counts.
+            raise httpx.ConnectError("simulated server down")
 
     ds = _AlwaysFails(
         _entries(20),
@@ -105,6 +106,28 @@ def _fetch(self, sample):
         ds[0]
 
 
+def test_contract_violation_propagates_not_swallowed():
+    """A non-transient error from _fetch (e.g. a contract violation / bug) must
+    surface immediately, not be masked as a fetch miss and silently resampled."""
+
+    class _BadContract(StreamingDataset):
+        def _tokenize_entry(self, entry):
+            return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
+
+        def _fetch(self, sample):
+            raise RuntimeError("server token_ids drift")
+
+    ds = _BadContract(
+        _entries(20),
+        tokenizer=MagicMock(),
+        # High threshold keeps the circuit breaker out of the picture: if the error
+        # were (wrongly) swallowed, a breaker-raised error could not mask the regression.
+        config=StreamingConfig(fail_after_consecutive_skips=100),
+    )
+    with pytest.raises(RuntimeError, match="server token_ids drift"):
+        ds[0]
+
+
 def test_fetch_returning_none_exhausts_then_raises():
     """If every entry's fetch yields None (e.g. all rejected), __getitem__ raises a
     clear 'no fetchable sample' error rather than hanging or returning junk."""
@@ -238,6 +261,50 @@ def handler(request: httpx.Request) -> httpx.Response:
     assert list(scratch.iterdir()) == [], "scratch files must be unlinked after fetch"
 
 
+def test_fetch_round_robins_across_server_urls(tmp_path, monkeypatch):
+    """With multiple server_urls, consecutive fetches alternate across endpoints so
+    load is spread over replicas rather than pinned to the first one."""
+    seq, n_layers, hidden = 8, 3, 16
+    scratch = tmp_path / "vllm_scratch"
+    scratch.mkdir()
+
+    hosts: list[str] = []
+    counter = {"n": 0}
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        hosts.append(request.url.host)
+        counter["n"] += 1
+        path = scratch / f"req_{counter['n']}.safetensors"
+        _write_canned_safetensors(path, seq, n_layers, hidden)
+        return httpx.Response(
+            200,
+            json={"kv_transfer_params": {"hidden_states_path": str(path)}},
+        )
+
+    _patch_sync_client(monkeypatch, handler)
+
+    n_entries = 4
+    entries = [
+        {"conversation_id": f"c-{i}", "messages": [{"role": "user", "content": "x"}]}
+        for i in range(n_entries)
+    ]
+    ds = EagleVllmStreamingDataset(
+        entries=entries,
+        tokenizer=_tokenizer_returning(seq),
+        config=EagleVllmStreamingConfig(
+            server_urls=["http://a:8000", "http://b:8000"],
+            model="mock-model",
+            shared_storage_root=str(scratch),
+        ),
+    )
+
+    for i in range(n_entries):
+        ds[i]
+
+    # Per-process round-robin cursor: a, b, a, b -- one request each, alternating.
+    assert hosts == ["a", "b", "a", "b"]
+
+
 def test_path_outside_shared_storage_root_is_rejected(tmp_path, monkeypatch):
     """Out-of-root path from the server is not opened or unlinked; the fetch yields
     None, so the single-entry corpus is exhausted and __getitem__ raises."""
diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index 0639d2afac7..dcdd86aad40 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -28,6 +28,26 @@
 import nemo_run as run
 import yaml
 
+__all__ = [
+    "DEFAULT_EXPERIMENT_TITLE",
+    "GlobalVariables",
+    "SandboxPipeline",
+    "SandboxTask",
+    "SandboxTask0",
+    "SandboxTask1",
+    "SandboxTask2",
+    "SandboxTask3",
+    "SandboxTask4",
+    "build_docker_executor",
+    "build_slurm_executor",
+    "create_task_from_yaml",
+    "get_default_env",
+    "register_factory",
+    "report_versions",
+    "run_jobs",
+    "set_slurm_config_type",
+]
+
 # ---------------------------------------------------------------------------
 # Default environment variables injected into every job
 # ---------------------------------------------------------------------------
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
index a943f39c27e..84a77217453 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
@@ -18,8 +18,9 @@
 #     yaml and add `- --runtime_params <path>` below — see
 #     examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern).
 #   - --draft_model_dir must point at a trained+exported HF-format DFLASH draft
-#     (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Edit the path
-#     below, or override on the CLI: pipeline.task_0.args[0]="--draft_model_dir /hf-local/<draft>"
+#     (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Set it via the
+#     `draft_model_dir` global_var below, or override on the CLI:
+#     pipeline.global_vars.draft_model_dir=/hf-local/<draft>
 #   - Kimi needs --trust_remote_code for both tokenizer and model.
 #
 # NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so
@@ -53,12 +54,14 @@ pipeline:
 
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+    # Trained + exported HF-format DFLASH draft checkpoint. Defaults to the standard
+    # export path; override on the CLI with: pipeline.global_vars.draft_model_dir=<path>
+    draft_model_dir: /hf-local/nvidia/Kimi-K2.5-DFlash
 
   task_0:
     script: common/specdec_bench/run.sh
     args:
-      # TODO: point at your trained + exported HF-format DFLASH draft checkpoint.
-      - --draft_model_dir /hf-local/nvidia/Kimi-K2.5-DFlash
+      - --draft_model_dir <<global_vars.draft_model_dir>>
       - --speculative_algorithm DFLASH
       - --engine VLLM
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index 0bcfff14ad9..9c3c853e877 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -24,6 +24,8 @@
 
 import nemo_run as run
 
+__all__ = ["SlurmConfig", "slurm_factory"]
+
 
 @dataclass
 class SlurmConfig:

From 9024ca92ddbefb649c7abcab628ab766ceaf7eb5 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 08:02:02 +0000
Subject: [PATCH 10/14] polish

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/eagle_utils.py  |  3 +-
 examples/speculative_decoding/main.py         | 15 +++---
 .../speculative/plugins/hf_training_args.py   |  6 +--
 .../plugins/test_hf_streaming_dataset.py      | 50 +++++++++++++++++++
 .../Qwen/Qwen3-8B/hf_streaming_eagle3.yaml    |  1 -
 5 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py
index bcdcf15e8c2..f3ef93d740e 100644
--- a/examples/speculative_decoding/eagle_utils.py
+++ b/examples/speculative_decoding/eagle_utils.py
@@ -59,7 +59,6 @@ def make_speculative_data_module(
     train_len=None,
     answer_only_loss=False,
     shift_labels=True,
-    seed: int = 0,
 ) -> dict:
     """Create data module for speculative decoding training.
 
@@ -90,7 +89,7 @@ def make_speculative_data_module(
             ds = ds.select(range(data_args.sample_size))
         # Map-style dataset: each rank fetches its own DistributedSampler shard.
         # Fetch concurrency comes from the DataLoader's num_workers, not a config knob;
-        # shuffling/order is the sampler's job, so no seed is threaded here.
+        # shuffling/order is the sampler's job (seeded by training_args.seed).
         # ``server_urls`` accepts a comma-separated string for multi-server fan-out.
         streaming_cfg = EagleVllmStreamingConfig(
             server_urls=data_args.streaming_server_url,
diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py
index 4405bf0cd90..bbb6cbd478a 100644
--- a/examples/speculative_decoding/main.py
+++ b/examples/speculative_decoding/main.py
@@ -267,7 +267,6 @@ def train():
         train_len=training_args.training_seq_len,
         answer_only_loss=training_args.answer_only_loss,
         shift_labels=not is_dflash,
-        seed=training_args.seed,
     )
 
     callbacks = [EagleTrainingPlot(training_args.ar_validate_steps, training_args.estimate_ar)]
@@ -277,13 +276,13 @@ def train():
         and recipe.eagle.eagle_base_lora_warmup_steps > 0
     ):
         callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps))
-    if recipe.data.mode == "streaming":
-        # The streaming dataset is map-style, so HF Trainer's default resume would
-        # fast-forward by re-iterating (= re-fetching) every consumed batch just to
-        # discard it, hammering the server. Disable the data skip: on resume, weights/
-        # optimizer/global_step still restore from the checkpoint; only the data order
-        # restarts from the top (acceptable for single-epoch streaming).
-        training_args.ignore_data_skip = True
+    # NB: do NOT set training_args.ignore_data_skip for streaming. The dataset is
+    # map-style, so HF Trainer's resume skip goes through accelerate.skip_first_batches,
+    # which drops the already-consumed indices at the batch-sampler level -- those
+    # indices never reach __getitem__, so no hidden states are re-fetched from the
+    # server. Resume therefore lands at the exact data position for free (correct even
+    # when a single epoch is split across many checkpointed segments). ignore_data_skip
+    # would instead restart the data order from the top, silently re-running data.
 
     trainer = EagleTrainerWithAccLog(
         model=model,
diff --git a/modelopt/torch/speculative/plugins/hf_training_args.py b/modelopt/torch/speculative/plugins/hf_training_args.py
index 2a9d4a1c4ff..6f86a467ab2 100644
--- a/modelopt/torch/speculative/plugins/hf_training_args.py
+++ b/modelopt/torch/speculative/plugins/hf_training_args.py
@@ -31,7 +31,7 @@
 
 from typing import Literal
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import BaseModel, ConfigDict, field_validator, model_validator
 
 __all__ = ["DataArguments", "ModelArguments", "TrainingArguments"]
 
@@ -64,10 +64,6 @@ class DataArguments(BaseModel):
     sample_size: int = -1
     streaming_server_url: str | None = None
     streaming_model_name: str | None = None
-    # Deprecated / no-op: the streaming dataset is map-style now, so fetch concurrency
-    # comes from the DataLoader's ``dataloader_num_workers``, not this knob. Kept so
-    # existing yamls that set ``data.streaming_prefetch`` still validate.
-    streaming_prefetch: int = Field(default=64, ge=1)
     # Mirror of the vLLM connector's ``shared_storage_path``; trainer-side allowlist.
     streaming_shared_storage_path: str | None = None
 
diff --git a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
index d4b910fe237..e6bac5b9755 100644
--- a/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
+++ b/tests/unit/torch/speculative/plugins/test_hf_streaming_dataset.py
@@ -148,6 +148,56 @@ def _fetch(self, sample):
         ds[0]
 
 
+def test_resume_skips_consumed_samples_without_refetching():
+    """Map-style resume contract: HF Trainer skips consumed batches via
+    accelerate.skip_first_batches, which drops their indices at the batch-sampler
+    level so __getitem__ (and thus _fetch) is never called for them. This is why
+    main.py leaves ignore_data_skip at its default (False) for streaming -- resume
+    lands at the exact position with no re-fetch. Guards against a regression that
+    would re-fetch (or re-stream) already-consumed samples on resume."""
+    pytest.importorskip("accelerate")
+    from accelerate import skip_first_batches
+    from torch.utils.data import DataLoader, RandomSampler
+
+    fetched: list[int] = []
+
+    class _Recording(StreamingDataset):
+        def _tokenize_entry(self, entry):
+            return {"cid": str(entry["id"]), "token_ids": [1], "loss_mask": None}
+
+        def _fetch(self, sample):
+            cid = int(sample["cid"])
+            fetched.append(cid)  # stands in for the HTTP fetch
+            return {"cid": cid}
+
+        def _format(self, payload):
+            return torch.tensor(payload["cid"])
+
+    n, batch_size, skip_batches = 20, 2, 3
+    ds = _Recording(_entries(n), tokenizer=MagicMock(), config=StreamingConfig())
+
+    def make_dl():
+        # Fresh, identically-seeded sampler -> identical permutation across runs.
+        return DataLoader(
+            ds,
+            batch_size=batch_size,
+            sampler=RandomSampler(ds, generator=torch.Generator().manual_seed(0)),
+        )
+
+    # Full pass -> ground-truth consumption order (cid == requested index here).
+    full_order = [int(x) for batch in make_dl() for x in batch]
+    fetched.clear()
+
+    # Resume: skip the first `skip_batches` batches.
+    tail_order = [int(x) for batch in skip_first_batches(make_dl(), skip_batches) for x in batch]
+
+    consumed = full_order[: skip_batches * batch_size]
+    expected_tail = full_order[skip_batches * batch_size :]
+    assert tail_order == expected_tail, "resume must continue at the exact data position"
+    assert set(fetched).isdisjoint(consumed), "skipped (consumed) samples must not be re-fetched"
+    assert fetched == expected_tail, "only the un-consumed tail is fetched after resume"
+
+
 def test_server_urls_normalization():
     """server_urls accepts a single string, a comma-separated string, or a list, and
     strips trailing slashes."""
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
index d93525632f3..91b8c54a8b5 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
@@ -48,7 +48,6 @@ pipeline:
       - model.model_name_or_path=<<global_vars.hf_model>>
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
-      - data.streaming_prefetch=64
       - training.output_dir=/scratchspace/eagle3
       - training.training_seq_len=4096
       - training.disable_tqdm=true

From 94d8bd4378118883b072f22a10df68e27c801f52 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 08:14:27 +0000
Subject: [PATCH 11/14] trim comments

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/launch_train.sh | 28 +++------
 examples/speculative_decoding/main.py         | 11 ++--
 .../common/eagle3/train_eagle_streaming.sh    | 53 +++++++---------
 .../hf_streaming_eagle3_multi_node.yaml       |  9 ++-
 .../Kimi-K2.5/hf_dflash_dryrun.yaml           | 20 +++---
 .../Kimi-K2.5/hf_streaming_dflash.yaml        | 63 ++++++++-----------
 .../hf_streaming_dflash_multi_node.yaml       | 56 +++++++----------
 .../Kimi-K2.5/hf_streaming_eagle3.yaml        | 13 ++--
 .../hf_streaming_eagle3_multi_node.yaml       | 22 +++----
 .../moonshotai/Kimi-K2.5/specdec_bench.yaml   |  6 +-
 10 files changed, 115 insertions(+), 166 deletions(-)

diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
index cd890c72053..fdcc123bb4a 100755
--- a/examples/speculative_decoding/launch_train.sh
+++ b/examples/speculative_decoding/launch_train.sh
@@ -61,22 +61,12 @@ fi
 # Multi-node routing args (accelerate only; training config comes from the YAML)
 MULTI_NODE_ARGS=()
 if [[ "$NUM_NODES" != "1" ]]; then
-  # machine_rank: caller may pass --machine_rank explicitly (needed when the
-  # SLURM allocation reserves node 0 for something else, e.g. the streaming
-  # vllm serve, so SLURM_PROCID is offset from accelerate's 0-based rank).
-  # Default to $SLURM_PROCID for the all-nodes-are-trainers case.
-  # Canonical accelerate multi-node launch for a fixed Slurm allocation:
-  # --multi_gpu + static rendezvous via main_process_ip/port (-> MASTER_ADDR/PORT).
-  #
-  # --multi_gpu is REQUIRED: with 1 GPU/node, each node's local process count is
-  # num_processes/num_machines = 1, and without --multi_gpu accelerate treats a
-  # single local process as non-distributed -- it never sets WORLD_SIZE/RANK or
-  # forms the process group, so every node trains the full dataset as its own
-  # world=1 (no hang, no real DDP). --multi_gpu forces DistributedType.MULTI_GPU
-  # so the nodes rendezvous into one world=$TOTAL_GPU group.
-  #
-  # Do NOT add --rdzv_backend c10d: that switches to the elastic launcher, which
-  # reads its endpoint from --rdzv_endpoint and ignores --main_process_ip.
+  # machine_rank defaults to $SLURM_PROCID; pass --machine_rank explicitly when the
+  # allocation reserves node 0 for something else (e.g. a streaming vllm serve).
+  # --multi_gpu is required even at 1 GPU/node -- without it accelerate treats a lone
+  # local process as non-distributed and never forms the process group (each node
+  # would train its own world=1). Use static rendezvous via main_process_ip/port; NOT
+  # --rdzv_backend c10d, which switches to the elastic launcher and ignores --main_process_ip.
   MULTI_NODE_ARGS=(
     --multi_gpu
     --num_processes "$TOTAL_GPU"
@@ -89,10 +79,8 @@ fi
 
 export TOKENIZERS_PARALLELISM=False
 
-# Build the argv directly (no `sh -c`): a re-parsed command string would word-split
-# overrides that contain spaces (e.g. training.output_dir=/tmp/has space) and would
-# execute command substitutions embedded in override values. An array preserves each
-# argument boundary verbatim.
+# Run as an argv array (not `sh -c "..."`, which would word-split overrides
+# containing spaces and execute command substitutions embedded in their values).
 CMD=(accelerate launch --mixed_precision bf16
      "${MULTI_NODE_ARGS[@]}"
      "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}")
diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py
index bbb6cbd478a..f62b099121d 100644
--- a/examples/speculative_decoding/main.py
+++ b/examples/speculative_decoding/main.py
@@ -276,13 +276,10 @@ def train():
         and recipe.eagle.eagle_base_lora_warmup_steps > 0
     ):
         callbacks.append(LoRAWarmupCallback(recipe.eagle.eagle_base_lora_warmup_steps))
-    # NB: do NOT set training_args.ignore_data_skip for streaming. The dataset is
-    # map-style, so HF Trainer's resume skip goes through accelerate.skip_first_batches,
-    # which drops the already-consumed indices at the batch-sampler level -- those
-    # indices never reach __getitem__, so no hidden states are re-fetched from the
-    # server. Resume therefore lands at the exact data position for free (correct even
-    # when a single epoch is split across many checkpointed segments). ignore_data_skip
-    # would instead restart the data order from the top, silently re-running data.
+    # Leave training_args.ignore_data_skip at its default (False). The dataset is
+    # map-style, so HF Trainer's resume skips consumed indices at the batch-sampler
+    # level (accelerate.skip_first_batches) without re-fetching them, landing at the
+    # exact data position. Setting it True would restart the data order from the top.
 
     trainer = EagleTrainerWithAccLog(
         model=model,
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index b3637b1621d..cd6bee89ac0 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -23,7 +23,7 @@
 # `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it
 # branches on $SLURM_NODEID:
 #   nodes == 1       -> co-located: vllm serve on $SERVE_GPU, trainer on the rest
-#                       of the local GPUs (original single-node behavior).
+#                       of the local GPUs.
 #   nodes >= 2       -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent
 #                       vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1
 #                       are trainers doing multi-node DDP. SERVE_NODES defaults to 1
@@ -36,8 +36,8 @@
 # The streaming dataset is map-style: HF Trainer's DistributedSampler shards the
 # corpus across all trainer ranks and each rank fetches ONLY its own shard,
 # round-robin across the SERVE_NODES replicas (data.streaming_server_url is the
-# comma-joined list). So trainer nodes scale effective batch / compute and
-# distribute the reads; serve nodes scale data-production throughput (~K x).
+# comma-joined list). Trainer nodes scale compute and distribute the reads; serve
+# nodes scale data-production throughput.
 #
 # Env vars (required):
 #   HF_MODEL_CKPT       Target model path. Used by both vllm serve (as the
@@ -79,14 +79,12 @@ source "${SCRIPT_DIR}/../service_utils.sh"
 # Container provisioning
 #
 # vllm/vllm-openai:* has vllm and torch but not modelopt or the speculative
-# trainer's deps. modelopt is bind-mounted at
-# /usr/local/lib/python3.12/dist-packages/modelopt, but it has no .dist-info
-# (so `importlib.metadata.version('nvidia-modelopt')` would fail). nemo_run
-# only ships modelopt subdirs, not the real pyproject.toml, so we synthesize
-# a minimal one with a correctly-scoped setuptools.packages.find include —
-# without `include = ["modelopt*"]`, setuptools sees both `modelopt/` and
-# `modelopt_recipes/` at the top level and refuses with a "flat-layout"
-# error. We then `pip install -e .` to register the dist-info.
+# trainer's deps. modelopt is bind-mounted but has no .dist-info (so
+# `importlib.metadata.version('nvidia-modelopt')` would fail), and nemo_run does
+# not ship the real pyproject.toml, so we synthesize a minimal one and
+# `pip install -e .` to register the dist-info. The setuptools.packages.find
+# `include` must be scoped (modelopt*, modelopt_recipes*) or setuptools sees two
+# top-level packages and fails with a "flat-layout" error.
 
 TOML=modules/Model-Optimizer/pyproject.toml
 if [ ! -f "$TOML" ]; then
@@ -144,13 +142,13 @@ SERVE_NODES="${SERVE_NODES:-1}"
 # by a unique vllm request id, so they don't collide across servers.
 SERVE_SCRATCH="/scratchspace/streaming_serve_scratch"
 SERVE_LOG="/scratchspace/vllm_serve.log"   # serve nodes override with a per-node path
-# Rendezvous over the shared /scratchspace mount (lustre, visible on every node):
-# each serve node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer
-# signals completion via DONE_FILE; trainers collect all serve addresses.
+# Rendezvous over the shared /scratchspace mount (visible on every node): each serve
+# node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer signals
+# completion via DONE_FILE; trainers collect all serve addresses.
 # Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on
 # the same shared mount don't read/write each other's addresses. SLURM_JOB_ID is
-# identical across every node of one allocation (so the namespacing is consistent)
-# and unique across allocations; falls back to a fixed token off-Slurm (single run).
+# identical across every node of one allocation and unique across allocations; falls
+# back to a fixed token off-Slurm (single run).
 RUN_ID="${SLURM_JOB_ID:-local}"
 SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}"
 DONE_FILE="/scratchspace/.training_done.${RUN_ID}"
@@ -199,11 +197,9 @@ launch_vllm() {
     # features skip recomputing cached/partial prefixes, which yields short or
     # empty hidden_states. Required, not optional.
     # --no-enable-flashinfer-autotune: on big NVFP4 MoE (Kimi) the flashinfer
-    # trtllm_fp4_block_scale_moe autotuner re-tunes on the first real serving
-    # step and stalls a worker past vLLM's execute-model timeout -> EngineCore
-    # dies with "RPC call to sample_tokens timed out" -> 500s -> trainer aborts.
-    # Disabling autotune keeps kernels static (and pairs with the larger
-    # VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS set in the example env).
+    # autotuner re-tunes on the first real serving step and stalls a worker past
+    # vLLM's execute-model timeout, killing EngineCore and aborting the trainer.
+    # Disabling autotune keeps kernels static; required on such models.
     "${gpu_env[@]}" vllm serve "$HF_MODEL_CKPT" \
         --host "$bind_host" \
         --port "$SERVE_PORT" \
@@ -249,22 +245,21 @@ wait_vllm_ready() {
 
 # Run the trainer then export the HF checkpoint.
 #   $1 = streaming server base URL   $2 = CUDA_VISIBLE_DEVICES ("" -> all)
-# The streaming dataset is map-style now, so fetch concurrency comes from the
-# DataLoader's workers (each worker = one in-flight request). STREAMING_NUM_WORKERS
-# sets that; keep it modest so (ranks-per-server x workers) stays near the server's
-# max_num_seqs (flooding a cold NVFP4 MoE server kills EngineCore). 0 disables
-# prefetch (serialized fetches) and is usually too slow.
+# Fetch concurrency comes from the DataLoader's workers (each worker = one in-flight
+# request). STREAMING_NUM_WORKERS sets that; keep it modest so (ranks-per-server x
+# workers) stays near the server's max_num_seqs (flooding a cold NVFP4 MoE server
+# kills EngineCore). 0 disables prefetch (serialized fetches) and is usually too slow.
 run_trainer_and_export() {
     local url="$1" cvd="$2"
-    # Optional multi-node trainer routing (see dispatch section). Defaults keep
-    # the original single-trainer-node behavior: no --num_nodes, export on rank 0.
+    # Optional multi-node trainer routing (see dispatch section). Defaults: single
+    # trainer node, no --num_nodes, export on rank 0.
     local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}"
     echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..."
     # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all).
     local -a gpu_env=()
     [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
     # Engage accelerate multi-node routing only when >1 trainer node; a single
-    # trainer node keeps the original invocation (no --num_nodes) verbatim.
+    # trainer node omits --num_nodes.
     local -a mn_args=()
     if [ "${num_tnodes}" -gt 1 ]; then
         mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank")
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
index aac5b71ecdf..d0c99f6f0be 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
@@ -15,9 +15,8 @@
 # How it scales: the dataset is map-style, so HF Trainer's DistributedSampler
 # shards the corpus across ALL trainer ranks and each rank fetches ONLY its own
 # shard, round-robin across the K serve replicas (data.streaming_server_url is the
-# comma-joined list). Trainer nodes scale effective batch / compute and distribute
-# the lustre reads; serve nodes scale data-production throughput (~K x), lifting
-# the single-serve ceiling.
+# comma-joined list). Trainer nodes scale effective batch / compute; serve nodes
+# scale data-production throughput (~K x).
 #
 # 3-step pipeline:
 #   task_0: Build input conversations (jsonl)
@@ -79,8 +78,8 @@ pipeline:
       # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
       - SERVE_NODES: "2"
       # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES.
-      # Here 4 ranks (2 nodes x 2 GPU) x 4 / 2 serves = 8 concurrent per serve — fine
-      # for Qwen's max_num_seqs.
+      # Here 4 ranks x 4 / 2 serves = 8 concurrent per serve — fine for Qwen's
+      # max_num_seqs.
       - STREAMING_NUM_WORKERS: "4"
     slurm_config:
       _factory_: "slurm_factory"
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
index b12c3b0f538..47ef2950b95 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
@@ -9,27 +9,25 @@
 #                                            the (untrained) ModelOpt checkpoint
 #                                            to training.output_dir right after
 #                                            mtsp.convert(model, [("dflash", ...)])
-#   data.offline_data_path=<placeholder>   → DataArguments derives data.mode from
-#                                            the data-source fields, so setting an
-#                                            offline path makes mode='offline' →
-#                                            use_offline_training=True. Combined
-#                                            with use_fake_base_for_offline=true
-#                                            this loads a FakeBaseModel (only
+#   data.offline_data_path=<placeholder>   → setting an offline path makes
+#                                            mode='offline' →
+#                                            use_offline_training=True. Combined
+#                                            with use_fake_base_for_offline=true
+#                                            this loads a FakeBaseModel (only
 #                                            embed_tokens + lm_head), so the ~1T
 #                                            MoE base fits on a single GPU. The
 #                                            file is never read in --dry_run mode.
 #   model.trust_remote_code=true           → Kimi-K2.5 (deepseek_v3 arch) ships a
 #                                            custom modeling file
-#   dflash.dflash_mask_token_id=163838     → Kimi-K2.5 has no dedicated mask token
-#                                            ([EOS]=163585, [PAD]=163839); 163838 is
-#                                            a reserved slot used as the DFlash mask
-#                                            (matches the real Kimi-K2.5 DFlash run)
+#   dflash.dflash_mask_token_id=163838     → Kimi-K2.5 has no dedicated mask token;
+#                                            163838 is a reserved slot used as the
+#                                            DFlash mask
 #
 # The dflash_online_training.sh export block then writes an HF-format DFlash draft
 # to /scratchspace/dflash/exported-checkpoint-final with the correct architecture
 # (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by
 # design. Useful for smoke-testing the launcher / convert / export plumbing and
-# validating downstream loaders without paying for a real training run.
+# validating downstream loaders without a real training run.
 #
 # Usage:
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
index d16ca3822c7..62e3c742e65 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -1,27 +1,23 @@
 # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
 # GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve +
-# trainer split, same hardware reasoning — but trains a DFlash drafter instead of
-# EAGLE3 by pointing the (shared, algorithm-agnostic) streaming script at the
-# dflash recipe.
+# trainer split, but trains a DFlash drafter instead of EAGLE3 by pointing the
+# shared, algorithm-agnostic streaming script at the dflash recipe.
 #
-# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
-# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
-# ~138 GB/GPU of weights) with NO cpu-offload. So: node 0 = vllm serve (TP=4,
-# whole node), node 1 = DFlash trainer (fake base), 4 GPUs each, 2 nodes.
+# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4
+# model fit at TP=4 on ONE 4-GPU node with no cpu-offload. Topology: node 0 =
+# vllm serve (TP=4, whole node), node 1 = DFlash trainer (fake base), 2 nodes.
 #
-# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
-# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
-# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets
-# dflash_offline=True and the DFlash module consumes the streamed hidden states
-# (base_model_outputs) instead of running the fake base. The vLLM connector,
-# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps
-# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST
-# captured layer into base_model_hidden_states (used for DFlash self-logit
-# distillation) and the REST into aux_hidden_states (DFlash's concatenated
-# target-layer features). So n_captured must be (num DFlash target layers + 1).
+# How streaming feeds DFlash: data.mode=streaming sets dflash_offline=True
+# (derived in modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline),
+# so the DFlash module consumes the streamed hidden states (base_model_outputs)
+# instead of running the fake base. vLLM dumps captured layers as
+# [seq, n_captured, hidden]; the dataset splits the LAST captured layer into
+# base_model_hidden_states (DFlash self-logit distillation) and the REST into
+# aux_hidden_states (DFlash's concatenated target-layer features). So n_captured
+# must be (num DFlash target layers + 1).
 #
 # Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
-# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3):
+# the true final layer is NOT capturable so we use 60 as the base):
 #   DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5)
 #   = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) =
 #   [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the
@@ -30,8 +26,8 @@
 # answer_only_loss: forced false here. DFlash's recipe default is true, which
 # requires the tokenizer chat template to carry {% generation %} tags so the
 # streaming dataset can derive an assistant-token mask; Kimi's template does not,
-# and the streaming path (unlike online) does not inject data.chat_template. To
-# train assistant-only later, supply a generation-tagged template and flip this on.
+# and the streaming path does not inject data.chat_template. To train
+# assistant-only later, supply a generation-tagged template and flip this on.
 #
 # Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
 #   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
@@ -70,8 +66,7 @@ pipeline:
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node.
-  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh):
-  # only the --config recipe (dflash vs eagle3) and EAGLE_CAPTURE_IDS differ.
+  # Reuses the shared streaming orchestrator common/eagle3/train_eagle_streaming.sh.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -102,27 +97,24 @@ pipeline:
       # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
       - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
       - SERVE_TP: "4"
-      # DataLoader workers per trainer rank = in-flight requests per rank. The
-      # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0),
-      # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS.
-      # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE
-      # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort.
+      # DataLoader workers per trainer rank = in-flight requests per rank. All
+      # trainer ranks fetch, so per-serve in-flight = trainer_world_size(4) x
+      # STREAMING_NUM_WORKERS. Keep at 1 so 4 x 1 = 4 = SERVE_MAX_NUM_SEQS; a wider
+      # flood stalls a cold NVFP4-MoE worker past vLLM's timeout and kills EngineCore.
       - STREAMING_NUM_WORKERS: "1"
       # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
-      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
-      # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the
-      # training seq len and leave headroom for activation spikes.
+      # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context
+      # to the training seq len, leaving headroom for activation spikes.
       - SERVE_MAX_MODEL_LEN: "4096"
       # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # The killer was "RPC call to sample_tokens timed out" — a worker stalls on
-      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
-      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
-      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      # A worker can stall on the first real serving step (cold NVFP4 MoE kernels)
+      # past vLLM's default execute-model timeout, killing EngineCore. Extend the
+      # timeouts (seconds) that govern that path.
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
@@ -130,8 +122,7 @@ pipeline:
       nodes: 2
       # Pin the serve node + trainer node into one NVL72 block. Inter-node here is
       # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this
-      # is a latency/locality nicety rather than a correctness requirement (cf. the
-      # multi_node examples, where cross-node trainer DDP makes segment essential).
+      # is a latency/locality nicety rather than a correctness requirement.
       segment: 2
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
index b645a0e428c..703f636d53b 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -6,9 +6,8 @@
 # (world_size=8). Tune the split via slurm_config.nodes/segment + SERVE_NODES:
 #   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
 #
-# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
-# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
-# ~138 GB/GPU of weights) with NO cpu-offload. So each serve replica owns a whole
+# Why GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 model fit
+# at TP=4 on ONE 4-GPU node with no cpu-offload. So each serve replica owns a whole
 # node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base).
 #
 # Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
@@ -18,29 +17,24 @@
 # segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
 # rides NVLink.
 #
-# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF
-# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each
-# rank fetches ONLY its own shard, round-robin across the K serve replicas
-# (data.streaming_server_url is the comma-joined list the script assembles). So
-# trainer nodes scale effective batch / compute and distribute the lustre reads;
-# serve nodes scale data-production throughput (~K x), lifting the single-serve
-# ceiling that bounded the old single-serve multi-node path.
+# How it scales: HF Trainer's DistributedSampler shards the corpus across ALL
+# trainer ranks and each rank fetches ONLY its own shard, round-robin across the K
+# serve replicas (data.streaming_server_url is the comma-joined list the script
+# assembles). Trainer nodes scale effective batch / compute and distribute the
+# lustre reads; serve nodes scale data-production throughput (~K x).
 #
-# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below):
-# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
-# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server
-# stalls a worker past vLLM's execute-model timeout and kills EngineCore.
+# Concurrency: keep it low for cold NVFP4 MoE (see SERVE_MAX_NUM_SEQS below).
+# Per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
+# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4.
 #
-# How streaming feeds DFlash (vs EAGLE3): the trainer's streaming path was wired
-# up for DFlash by deriving dflash_offline from data.mode (modelopt/recipe/config.py
-# ModelOptDFlashRecipe._derive_dflash_offline), so data.mode=streaming sets
-# dflash_offline=True and the DFlash module consumes the streamed hidden states
-# (base_model_outputs) instead of running the fake base. The vLLM connector,
-# streaming dataset, and offline collator are all algorithm-agnostic: vLLM dumps
-# captured layers as [seq, n_captured, hidden]; the dataset splits the LAST
-# captured layer into base_model_hidden_states (used for DFlash self-logit
-# distillation) and the REST into aux_hidden_states (DFlash's concatenated
-# target-layer features). So n_captured must be (num DFlash target layers + 1).
+# How streaming feeds DFlash: data.mode=streaming sets dflash_offline=True
+# (modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), so the
+# DFlash module consumes the streamed hidden states (base_model_outputs) instead of
+# running the fake base. vLLM dumps captured layers as [seq, n_captured, hidden];
+# the dataset splits the LAST captured layer into base_model_hidden_states (DFlash
+# self-logit distillation) and the REST into aux_hidden_states (DFlash's
+# concatenated target-layer features). So n_captured must be (num DFlash target
+# layers + 1).
 #
 # Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
 # the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3):
@@ -130,24 +124,20 @@ pipeline:
       - SERVE_TP: "4"
       # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1:
       # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS.
-      # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker
-      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
       - STREAMING_NUM_WORKERS: "1"
       # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
-      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
-      # native max_seq_len is 262144, whose KV cache OOMs. Cap context to the
-      # training seq len and leave headroom for activation spikes.
+      # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context
+      # to the training seq len and leave headroom for activation spikes.
       - SERVE_MAX_MODEL_LEN: "4096"
       # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # The killer was "RPC call to sample_tokens timed out" — a worker stalls on
-      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
-      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
-      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      # A worker can stall on the first real serving step (cold NVFP4 MoE kernels)
+      # past vLLM's default execute-model timeout, killing EngineCore. Extend the
+      # timeouts that govern that path (seconds).
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
index 9ed18150869..3f6cf23d1b7 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
@@ -1,12 +1,9 @@
-# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell
-# (HSG). This is the streaming config that actually runs end-to-end: on CW H100
-# the ~551 GB model needed cpu-offload (-> ~1 tok/s -> vLLM EngineCore
-# TimeoutError), so the working path is GB200.
+# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell (HSG).
 #
-# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
-# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
-# ~138 GB/GPU of weights) with NO cpu-offload. So here: node 0 = vllm serve
-# (TP=4, whole node), node 1 = EAGLE3 trainer (fake base), 4 GPUs each, 2 nodes.
+# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE
+# 4-GPU node with no cpu-offload (on H100 it needs offload and is too slow to be
+# usable). Topology: node 0 = vllm serve (TP=4, whole node), node 1 = EAGLE3 trainer
+# (fake base); 4 GPUs each, 2 nodes.
 #
 # Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed
 # by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60.
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
index e0c32debb0e..bddf6b06909 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
@@ -6,12 +6,9 @@
 # via slurm_config.nodes/segment + SERVE_NODES:
 #   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
 #
-# Why GB200: nodes have only 4 GPUs each (vs CW's 8), but 192 GB/GPU and native
-# NVFP4. Kimi-K2.5-NVFP4 (~551 GB) fits at TP=4 on ONE node (4 x 192 = 768 GB,
-# ~138 GB/GPU of weights) with NO cpu-offload. On CW H100 the model needed
-# cpu-offload (-> ~1 tok/s -> vLLM EngineCore TimeoutError), so GB200 is the
-# working path. Each serve replica owns a whole node at TP=4; each trainer node
-# uses all 4 GPUs for the draft (fake base).
+# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE
+# 4-GPU node with no cpu-offload. Each serve replica owns a whole node at TP=4; each
+# trainer node uses all 4 GPUs for the draft (fake base).
 #
 # Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
 # Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
@@ -20,13 +17,12 @@
 # segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
 # rides NVLink.
 #
-# How it scales (map-style dataset, NOT the old rank-0-fetch+broadcast): HF
-# Trainer's DistributedSampler shards the corpus across ALL trainer ranks and each
-# rank fetches ONLY its own shard, round-robin across the K serve replicas
-# (data.streaming_server_url is the comma-joined list the script assembles). So
-# trainer nodes scale effective batch / compute and distribute the lustre reads;
-# serve nodes scale data-production throughput (~K x), lifting the single-serve
-# ceiling.
+# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler shards
+# the corpus across ALL trainer ranks and each rank fetches ONLY its own shard,
+# round-robin across the K serve replicas (data.streaming_server_url is the
+# comma-joined list the script assembles). So trainer nodes scale effective batch /
+# compute and distribute the lustre reads; serve nodes scale data-production
+# throughput (~K x), lifting the single-serve ceiling.
 #
 # Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below):
 # per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
index 84a77217453..b2eea8c1ec6 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
@@ -6,10 +6,8 @@
 # aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl
 # to /scratchspace/specdec_bench/.
 #
-# Hardware = GB200/Blackwell (HSG), same reasoning as hf_streaming_eagle3.yaml:
-# Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the 192 GB/GPU of GB200; it fits
-# at TP=4 on ONE 4-GPU node with no cpu-offload. On CW H100 it has no native FP4
-# and falls back to offload, so the working path is GB200.
+# Requires GB200/Blackwell (HSG): Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the
+# 192 GB/GPU of GB200 to fit at TP=4 on ONE 4-GPU node with no cpu-offload.
 #
 # DFLASH specifics:
 #   - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8);

From 221a3aed78270e4a5bffad9e976e43804b1789bd Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 08:27:49 +0000
Subject: [PATCH 12/14] trim comments

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/launch_train.sh |  18 +-
 .../common/eagle3/train_eagle_streaming.sh    | 190 ++++++------------
 tools/launcher/core.py                        |  24 +--
 .../Qwen/Qwen3-8B/hf_streaming_eagle3.yaml    |  23 +--
 .../hf_streaming_eagle3_multi_node.yaml       |  34 +---
 .../Kimi-K2.5/hf_dflash_dryrun.yaml           |  39 +---
 .../Kimi-K2.5/hf_streaming_dflash.yaml        |  78 ++-----
 .../hf_streaming_dflash_multi_node.yaml       | 102 +++-------
 .../Kimi-K2.5/hf_streaming_eagle3.yaml        |  48 ++---
 .../hf_streaming_eagle3_multi_node.yaml       |  74 ++-----
 .../moonshotai/Kimi-K2.5/specdec_bench.yaml   |  55 ++---
 11 files changed, 186 insertions(+), 499 deletions(-)

diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
index fdcc123bb4a..a6104f35fe6 100755
--- a/examples/speculative_decoding/launch_train.sh
+++ b/examples/speculative_decoding/launch_train.sh
@@ -19,9 +19,8 @@
 #   Multi-node:   ./launch_train.sh --config ../../modelopt_recipes/general/speculative_decoding/eagle3.yaml --num_nodes 2 --head_node_ip <IP>
 #   With overrides: ./launch_train.sh --config my.yaml model.model_name_or_path=xxx training.output_dir=yyy
 #
-# Extra key=value args are forwarded as OmegaConf dotlist overrides to main.py.
-# All training config (model, data, hyperparams, eagle, fsdp) lives in the YAML file.
-# Only multi-node routing args are passed here; mixed_precision is fixed to bf16.
+# Extra key=value args are forwarded as OmegaConf dotlist overrides to main.py; all
+# training config lives in the YAML. mixed_precision is fixed to bf16.
 
 set -eo pipefail
 
@@ -48,7 +47,6 @@ if [ -z "$CONFIG_FILE" ]; then
   exit 1
 fi
 
-# GPU count detection
 if [[ "$NUM_NODES" != "1" ]]; then
   GPU_PER_NODE=${GPU_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)}
   TOTAL_GPU=$((NUM_NODES * GPU_PER_NODE))
@@ -58,15 +56,10 @@ else
   echo "Total GPUs: $TOTAL_GPU (single node)"
 fi
 
-# Multi-node routing args (accelerate only; training config comes from the YAML)
 MULTI_NODE_ARGS=()
 if [[ "$NUM_NODES" != "1" ]]; then
-  # machine_rank defaults to $SLURM_PROCID; pass --machine_rank explicitly when the
-  # allocation reserves node 0 for something else (e.g. a streaming vllm serve).
-  # --multi_gpu is required even at 1 GPU/node -- without it accelerate treats a lone
-  # local process as non-distributed and never forms the process group (each node
-  # would train its own world=1). Use static rendezvous via main_process_ip/port; NOT
-  # --rdzv_backend c10d, which switches to the elastic launcher and ignores it.
+  # --multi_gpu is required even at 1 GPU/node, else accelerate won't form the DDP group.
+  # machine_rank defaults to $SLURM_PROCID; override --machine_rank if node 0 isn't a trainer.
   MULTI_NODE_ARGS=(
     --multi_gpu
     --num_processes "$TOTAL_GPU"
@@ -79,8 +72,7 @@ fi
 
 export TOKENIZERS_PARALLELISM=False
 
-# Run as an argv array (not `sh -c "..."`, which would word-split overrides
-# containing spaces and execute command substitutions embedded in their values).
+# Run as an argv array, not `sh -c` (which would word-split overrides and run embedded substitutions).
 CMD=(accelerate launch --mixed_precision bf16
      "${MULTI_NODE_ARGS[@]}"
      "${SCRIPT_DIR}/main.py" --config "$CONFIG_FILE" "${EXTRA_ARGS[@]}")
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index cd6bee89ac0..6f9636c459a 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -15,76 +15,50 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# EAGLE3 streaming training: runs a `vllm serve` (KV-transfer producer of hidden
-# states) alongside the trainer and routes hidden states over HTTP rather than
-# dumping to disk. Sibling of train_eagle.sh.
+# EAGLE3 streaming training: a `vllm serve` (KV-transfer hidden-states producer)
+# runs alongside the trainer, sending hidden states over HTTP rather than via disk.
 #
-# Topology is chosen automatically from the Slurm allocation (the launcher yaml's
-# `nodes:` field) and $SERVE_NODES; nemo_run runs this script once per node, so it
-# branches on $SLURM_NODEID:
-#   nodes == 1       -> co-located: vllm serve on $SERVE_GPU, trainer on the rest
-#                       of the local GPUs.
-#   nodes >= 2       -> split: Slurm nodes 0..SERVE_NODES-1 each run an independent
-#                       vllm serve replica (whole node); nodes SERVE_NODES..NNODES-1
-#                       are trainers doing multi-node DDP. SERVE_NODES defaults to 1
-#                       (1 serve + N trainers). Rendezvous over the shared
-#                       /scratchspace mount: each serve i publishes its address to
-#                       .serve_addr.i; the head trainer (first trainer node,
-#                       accelerate machine_rank 0) publishes its IP for accelerate's
-#                       rendezvous; trainers collect every serve address.
-#
-# The streaming dataset is map-style: HF Trainer's DistributedSampler shards the
-# corpus across all trainer ranks and each rank fetches ONLY its own shard,
-# round-robin across the SERVE_NODES replicas (data.streaming_server_url is the
-# comma-joined list). Trainer nodes scale compute and distribute the reads; serve
-# nodes scale data-production throughput.
+# CANONICAL TOPOLOGY/DISPATCH (per-example YAMLs cross-reference here). Topology is
+# auto-chosen from the Slurm allocation (yaml `nodes:`) and $SERVE_NODES; nemo_run
+# runs this script once per node, branching on $SLURM_NODEID:
+#   nodes == 1  -> co-located: vllm serve on $SERVE_GPU, trainer on the rest.
+#   nodes >= 2  -> split: nodes 0..SERVE_NODES-1 each run an independent whole-node
+#                 vllm serve replica; nodes SERVE_NODES..NNODES-1 are multi-node-DDP
+#                 trainers. SERVE_NODES default 1. Rendezvous over shared
+#                 /scratchspace: each serve i publishes .serve_addr.i; head trainer
+#                 (first trainer node = accelerate machine_rank 0) publishes its IP;
+#                 trainers collect every serve address.
+# Map-style dataset: DistributedSampler shards the corpus across trainer ranks, each
+# rank fetches only its shard round-robin across the SERVE_NODES replicas
+# (data.streaming_server_url = comma-joined list).
 #
 # Env vars (required):
-#   HF_MODEL_CKPT       Target model path. Used by both vllm serve (as the
-#                       model arg, becomes the served-model-name) and the
-#                       trainer (data.streaming_model_name).
-#   EAGLE_CAPTURE_IDS   JSON list of 1-based layer ids vllm should capture.
-#                       Must equal default_eagle_aux_layer_ids(L) shifted by +1,
-#                       plus the final layer L. For Qwen3-8B (L=36):
-#                       default = [1,17,32] -> capture = [2,18,33,36].
+#   HF_MODEL_CKPT       Target model path; vllm serve model arg (= served-model-name)
+#                       and trainer data.streaming_model_name.
+#   EAGLE_CAPTURE_IDS   JSON list of 1-based layer ids to capture: default_eagle_aux_layer_ids(L)
+#                       shifted by +1, plus final layer L. Qwen3-8B (L=36): [1,17,32]->[2,18,33,36].
 #
 # Env vars (optional):
-#   SERVE_NODES         multi-node only: number of dedicated serve replica nodes
-#                       (Slurm nodes 0..SERVE_NODES-1). default 1.
-#   SERVE_PORT          default 8765
-#   SERVE_GPU_MEM_UTIL  default 0.4 (single-node) / 0.9 (multi-node serve node)
-#   SERVE_READY_TIMEOUT seconds to wait for the server to come up. default 900
-#   SERVE_EXTRA_ARGS    extra flags appended to `vllm serve` (e.g. --trust-remote-code)
-#   SERVE_CPU_OFFLOAD_GB  GB of weights/GPU to offload to host RAM (fits big models
-#                         on too-few GPUs; slower). e.g. "10"
-#   SERVE_MAX_MODEL_LEN   cap vllm context length (trims KV/activation). e.g. "4096"
-#   SERVE_MAX_NUM_SEQS    cap concurrent sequences (trims KV/activation). e.g. "8"
-#   SERVE_HOST          single-node only: bind/connect host. default 127.0.0.1
-#   SERVE_GPU           single-node only: CUDA_VISIBLE_DEVICES for vllm. default "0"
-#   SERVE_TP            tensor-parallel size. default 1 (single-node) / all GPUs
-#                       on the serve node (multi-node)
-#   TRAIN_GPUS          single-node only: CUDA_VISIBLE_DEVICES for the trainer.
-#                       default = all local GPUs except SERVE_GPU.
-#   SERVE_ADVERTISE_IP  multi-node only: address node 1 should dial. default is
-#                       node 0's routable IP (its resolved Slurm node name, else
-#                       its first non-loopback / non-link-local IP).
-#
-# All script args are forwarded to launch_train.sh (typically: --config <yaml>
-# plus OmegaConf dotlist overrides).
+#   SERVE_NODES         multi-node: dedicated serve replica nodes (0..SERVE_NODES-1). default 1
+#   SERVE_GPU_MEM_UTIL  default 0.4 single-node / 0.9 multi-node serve node
+#   SERVE_READY_TIMEOUT server startup wait, seconds. default 900
+#   SERVE_EXTRA_ARGS    extra `vllm serve` flags (e.g. --trust-remote-code)
+#   SERVE_CPU_OFFLOAD_GB  GB/GPU offloaded to host RAM (fits big models on too-few GPUs; slower)
+#   SERVE_MAX_MODEL_LEN   cap context length (trims KV/activation)
+#   SERVE_MAX_NUM_SEQS    cap concurrent sequences (trims KV/activation)
+#   SERVE_HOST          single-node: bind/connect host. default 127.0.0.1
+#   SERVE_GPU           single-node: CUDA_VISIBLE_DEVICES for vllm. default "0"
+#   SERVE_TP            tensor-parallel size. default 1 single-node / all serve-node GPUs
+#   TRAIN_GPUS          single-node: trainer CUDA_VISIBLE_DEVICES. default = all but SERVE_GPU
+#   SERVE_ADVERTISE_IP  multi-node: address trainers dial for a serve node. default: that node's routable IP
 
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source "${SCRIPT_DIR}/../service_utils.sh"
 
 ###################################################################################################
-# Container provisioning
-#
-# vllm/vllm-openai:* has vllm and torch but not modelopt or the speculative
-# trainer's deps. modelopt is bind-mounted but has no .dist-info (so
-# `importlib.metadata.version('nvidia-modelopt')` would fail), and nemo_run does
-# not ship the real pyproject.toml, so we synthesize a minimal one and
-# `pip install -e .` to register the dist-info. The setuptools.packages.find
-# `include` must be scoped (modelopt*, modelopt_recipes*) or setuptools sees two
-# top-level packages and fails with a "flat-layout" error.
+# Container provisioning: the vllm image lacks modelopt's .dist-info and the real
+# pyproject, so synthesize a minimal pyproject (scoped `include` avoids setuptools'
+# flat-layout error) and `pip install -e .`.
 
 TOML=modules/Model-Optimizer/pyproject.toml
 if [ ! -f "$TOML" ]; then
@@ -130,25 +104,18 @@ if [ -z "$EAGLE_CAPTURE_IDS" ]; then
     echo "ERROR: EAGLE_CAPTURE_IDS must be set (e.g. '[2, 18, 33, 36]' for Qwen3-8B)." >&2; exit 1
 fi
 
-# Everything passed to this script (--config <yaml> + OmegaConf dotlist) is
-# forwarded verbatim to the trainer. Capture it before the helpers below run.
+# Forwarded verbatim to the trainer; capture before the helpers below run.
 SCRIPT_ARGS=("$@")
 
 SERVE_PORT="${SERVE_PORT:-8765}"
 SERVE_READY_TIMEOUT="${SERVE_READY_TIMEOUT:-900}"
-# Number of dedicated serve replica nodes (multi-node only). Default 1.
 SERVE_NODES="${SERVE_NODES:-1}"
-# All serve replicas share one scratch dir; per-request safetensors files are keyed
-# by a unique vllm request id, so they don't collide across servers.
+# Shared scratch; per-request safetensors keyed by vllm request id, so no collision.
 SERVE_SCRATCH="/scratchspace/streaming_serve_scratch"
 SERVE_LOG="/scratchspace/vllm_serve.log"   # serve nodes override with a per-node path
-# Rendezvous over the shared /scratchspace mount (visible on every node): each serve
-# node i publishes its address to ${SERVE_ADDR_FILE}.i; the head trainer signals
-# completion via DONE_FILE; trainers collect all serve addresses.
-# Namespace the rendezvous/sentinel files per Slurm job so concurrent allocations on
-# the same shared mount don't read/write each other's addresses. SLURM_JOB_ID is
-# identical across every node of one allocation and unique across allocations; falls
-# back to a fixed token off-Slurm (single run).
+# Namespace rendezvous/sentinel files per Slurm job (SLURM_JOB_ID: same across an
+# allocation's nodes, unique across allocations) so concurrent allocations on the
+# shared mount don't clobber each other's addresses. Fixed token off-Slurm.
 RUN_ID="${SLURM_JOB_ID:-local}"
 SERVE_ADDR_FILE="/scratchspace/.serve_addr.${RUN_ID}"
 DONE_FILE="/scratchspace/.training_done.${RUN_ID}"
@@ -164,10 +131,9 @@ cleanup() {
 
 gpus_on_node() { nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n1; }
 
-# Resolve a *routable* IP for this node (other nodes must be able to dial it).
-# `hostname -I` can list a link-local (169.254.x) or loopback address first, so
-# prefer the resolved Slurm node name, then the first non-loopback/non-link-local IP.
-#   $1 = optional override (e.g. SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP)
+# Resolve a routable IP (other nodes must dial it). `hostname -I` can list a
+# link-local/loopback first, so prefer the Slurm node name, then first non-lo/non-ll IP.
+#   $1 = optional override (SERVE_ADVERTISE_IP / TRAINER_ADVERTISE_IP)
 resolve_routable_ip() {
     local ip="$1"
     [ -z "$ip" ] && ip=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1)
@@ -181,25 +147,16 @@ resolve_routable_ip() {
 launch_vllm() {
     local bind_host="$1" tp="$2" cvd="$3"
     echo "Launching vllm serve on ${bind_host}:${SERVE_PORT} (TP=${tp}, CUDA_VISIBLE_DEVICES=${cvd:-all}, mem=${SERVE_GPU_MEM_UTIL}, log: $SERVE_LOG)..."
-    # Only pin GPUs when a non-empty set is given; an empty CUDA_VISIBLE_DEVICES
-    # would expose *zero* GPUs (not all), so leave it unset to use the whole node.
+    # Pin GPUs only for a non-empty set; empty CUDA_VISIBLE_DEVICES hides ALL, so unset = whole node.
     local -a gpu_env=()
     [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
-    # Optional single-value memory knobs (see header), assembled into --flag
-    # value pairs. Each is a space-free env value so it survives nemo_run's
-    # unquoted `export FOO=value`.
+    # Optional memory knobs (see header). Space-free env values to survive nemo_run's unquoted export.
     local -a opt_args=()
     [ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB")
     [ -n "${SERVE_MAX_MODEL_LEN:-}" ]  && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN")
     [ -n "${SERVE_MAX_NUM_SEQS:-}" ]   && opt_args+=(--max-num-seqs "$SERVE_MAX_NUM_SEQS")
-    # --no-enable-chunked-prefill / --no-enable-prefix-caching: the
-    # ExampleHiddenStatesConnector captures hidden states during prefill; both
-    # features skip recomputing cached/partial prefixes, which yields short or
-    # empty hidden_states. Required, not optional.
-    # --no-enable-flashinfer-autotune: on big NVFP4 MoE (Kimi) the flashinfer
-    # autotuner re-tunes on the first real serving step and stalls a worker past
-    # vLLM's execute-model timeout, killing EngineCore and aborting the trainer.
-    # Required there; keeps kernels static.
+    # --no-enable-chunked-prefill / --no-enable-prefix-caching: connector captures hidden states during prefill; both skip recomputing cached/partial prefixes, yielding short/empty hidden_states. Required.
+    # --no-enable-flashinfer-autotune: on NVFP4 MoE the autotuner re-tunes on the first serving step and stalls a worker past vLLM's execute-model timeout, killing EngineCore.
     "${gpu_env[@]}" vllm serve "$HF_MODEL_CKPT" \
         --host "$bind_host" \
         --port "$SERVE_PORT" \
@@ -245,21 +202,16 @@ wait_vllm_ready() {
 
 # Run the trainer then export the HF checkpoint.
 #   $1 = streaming server base URL   $2 = CUDA_VISIBLE_DEVICES ("" -> all)
-# Fetch concurrency comes from the DataLoader's workers (each worker = one in-flight
-# request). STREAMING_NUM_WORKERS sets that; keep it modest so (ranks-per-server x
-# workers) stays near the server's max_num_seqs (flooding a cold NVFP4 MoE server
-# kills EngineCore). 0 disables prefetch (serialized fetches) and is usually too slow.
+# DataLoader workers = in-flight fetches per rank; keep modest so (ranks-per-serve x workers) stays near each serve's max_num_seqs.
 run_trainer_and_export() {
     local url="$1" cvd="$2"
-    # Optional multi-node trainer routing (see dispatch section). Defaults: single
-    # trainer node, no --num_nodes, export on rank 0.
+    # Optional multi-node trainer routing (see dispatch). Defaults: 1 node, no --num_nodes, export on rank 0.
     local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}"
     echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..."
-    # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all).
+    # Empty cvd -> all GPUs (don't set the var; "" hides all).
     local -a gpu_env=()
     [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd")
-    # Engage accelerate multi-node routing only when >1 trainer node; a single
-    # trainer node omits --num_nodes.
+    # accelerate multi-node routing only when >1 trainer node.
     local -a mn_args=()
     if [ "${num_tnodes}" -gt 1 ]; then
         mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank")
@@ -273,19 +225,15 @@ run_trainer_and_export() {
         training.dataloader_num_workers="${STREAMING_NUM_WORKERS:-4}" \
         || { echo "ERROR: trainer failed." >&2; return 1; }
 
-    # Export only on the head trainer (machine_rank 0); non-head trainer nodes
-    # would race writing the same export dir. The export reads the saved
-    # checkpoint (training.output_dir), not the serve, so it is serve-independent.
+    # Export only on the head trainer (machine_rank 0); non-head nodes would race the same export dir. Export reads training.output_dir, not the serve.
     if [ "${mrank}" -ne 0 ]; then
         echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)."
         return 0
     fi
 
-    # Export the trained draft to HF format. Derive the checkpoint dir from the
-    # forwarded `training.output_dir=` dotlist (defaulting to the EAGLE
-    # convention) so EAGLE and DFlash runs each export their own output_dir.
-    # EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass
-    # --trust_remote_code; empty by default so EAGLE behavior is unchanged.
+    # Derive checkpoint dir from the forwarded training.output_dir= dotlist (EAGLE default)
+    # so EAGLE/DFlash runs each export their own dir. EXPORT_EXTRA_ARGS lets DFlash on a
+    # custom-modeling base (e.g. Kimi) pass --trust_remote_code; empty by default.
     local out_dir
     out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1)
     out_dir="${out_dir:-/scratchspace/eagle3}"
@@ -295,16 +243,12 @@ run_trainer_and_export() {
         ${EXPORT_EXTRA_ARGS:-}
 }
 
-# ---------------------------------------------------------------------------
-# Topology dispatch (see header): nemo_run runs this script once per node, so
-# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads.
-# ---------------------------------------------------------------------------
+# Topology dispatch (see header): branch on $SLURM_NNODES / $SLURM_NODEID.
 NNODES="${SLURM_NNODES:-1}"
 NODEID="${SLURM_NODEID:-0}"
 
-# Multi-node needs at least one trainer node: with SERVE_NODES >= NNODES every node
-# takes the serve branch, so no trainer ever publishes the rendezvous address or the
-# DONE_FILE and the serve nodes block forever. Reject it up front.
+# Need >=1 trainer node: with SERVE_NODES >= NNODES every node takes the serve branch,
+# so nobody publishes the rendezvous/DONE_FILE and serve nodes block forever.
 if [ "$NNODES" -gt 1 ] && [ "$SERVE_NODES" -ge "$NNODES" ]; then
     echo "ERROR: SERVE_NODES ($SERVE_NODES) must be < SLURM_NNODES ($NNODES); need >=1 trainer node." >&2
     exit 1
@@ -336,8 +280,7 @@ PY
 
 elif [ "$NODEID" -lt "$SERVE_NODES" ]; then
     # ---------------------- multi-node: serve node(s) ----------------------
-    # Slurm nodes 0..SERVE_NODES-1 each run an independent vllm serve replica on
-    # their whole node and publish their address to ${SERVE_ADDR_FILE}.${NODEID}.
+    # Each runs a whole-node vllm serve replica and publishes ${SERVE_ADDR_FILE}.${NODEID}.
     SERVE_GPU_MEM_UTIL="${SERVE_GPU_MEM_UTIL:-0.9}"     # dedicated node -> use most of it
     SERVE_TP="${SERVE_TP:-$(gpus_on_node)}"              # default: all GPUs on this node
     SERVE_LOG="/scratchspace/vllm_serve.${NODEID}.log"  # per-node log (avoid collision)
@@ -356,21 +299,19 @@ elif [ "$NODEID" -lt "$SERVE_NODES" ]; then
 
 else
     # -------------------- multi-node: trainer node(s) ----------------------
-    # Serve nodes are 0..SERVE_NODES-1; trainer nodes are SERVE_NODES..NNODES-1,
-    # mapping to 0-based accelerate machine ranks (head trainer = first trainer node).
+    # Trainer nodes SERVE_NODES..NNODES-1 -> 0-based accelerate machine ranks.
     NUM_TRAINER_NODES=$(( NNODES - SERVE_NODES ))
     TRAINER_RANK=$(( NODEID - SERVE_NODES ))
     TRAINER_ADDR_FILE="/scratchspace/.trainer_addr.${RUN_ID}"  # per-job (see RUN_ID)
 
-    # Only the head trainer (rank 0) signals the serve nodes to release on exit;
-    # a non-head node exiting first must NOT tear the serves down early.
+    # Only head trainer (rank 0) signals serves to release on exit; a non-head node
+    # exiting first must NOT tear them down early.
     if [ "$TRAINER_RANK" -eq 0 ]; then
         trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT
         rm -f "$TRAINER_ADDR_FILE"                 # clear stale rendezvous state
     fi
 
-    # Collect every serve replica's address and build the comma-joined URL list the
-    # streaming dataset round-robins across (one fetch per worker, spread over serves).
+    # Collect serve addresses into the comma-joined URL list the dataset round-robins across.
     echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for ${SERVE_NODES} serve address(es)..."
     URLS=""
     for ((s = 0; s < SERVE_NODES; s++)); do
@@ -387,11 +328,10 @@ else
     echo "Trainer rank ${TRAINER_RANK} using serve URLs: ${URLS}"
 
     if [ "$NUM_TRAINER_NODES" -le 1 ]; then
-        # 1 trainer node: single-node DDP (no accelerate multi-node routing).
+        # 1 trainer node: single-node DDP.
         run_trainer_and_export "$URLS" "" || exit 1
     else
-        # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's
-        # rendezvous (port 29500); all trainer nodes read it and join.
+        # >1 trainer node: head publishes its routable IP for accelerate rendezvous (port 29500); all read and join.
         if [ "$TRAINER_RANK" -eq 0 ]; then
             head_addr=$(resolve_routable_ip "${TRAINER_ADVERTISE_IP:-}")
             echo "$head_addr" > "$TRAINER_ADDR_FILE"
@@ -410,5 +350,3 @@ else
 fi
 
 ###################################################################################################
-
-#exit_handler $0
diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index dcdd86aad40..9154c1427bc 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -28,26 +28,6 @@
 import nemo_run as run
 import yaml
 
-__all__ = [
-    "DEFAULT_EXPERIMENT_TITLE",
-    "GlobalVariables",
-    "SandboxPipeline",
-    "SandboxTask",
-    "SandboxTask0",
-    "SandboxTask1",
-    "SandboxTask2",
-    "SandboxTask3",
-    "SandboxTask4",
-    "build_docker_executor",
-    "build_slurm_executor",
-    "create_task_from_yaml",
-    "get_default_env",
-    "register_factory",
-    "report_versions",
-    "run_jobs",
-    "set_slurm_config_type",
-]
-
 # ---------------------------------------------------------------------------
 # Default environment variables injected into every job
 # ---------------------------------------------------------------------------
@@ -277,8 +257,8 @@ def build_slurm_executor(
     # use a LocalTunnel: nemo_run then runs sbatch and copies artifacts via local
     # subprocess/shutil instead of ssh+rsync. This avoids flaky/hanging ssh-to-
     # localhost (e.g. MaxStartups throttling on a shared login node, or clusters
-    # like HSG that are only reachable through an sss proxy so paramiko can't
-    # tunnel in from outside). For real remote hosts, keep the SSHTunnel.
+    # only reachable through a login proxy so paramiko can't tunnel in from
+    # outside). For real remote hosts, keep the SSHTunnel.
     if slurm_config.host in ("localhost", "127.0.0.1"):
         tunnel = run.LocalTunnel(job_dir=job_dir)
     else:
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
index 91b8c54a8b5..d46e0eee68b 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
@@ -2,14 +2,11 @@
 #
 # 3-step pipeline:
 #   task_0: Build input conversations (jsonl)
-#   task_1: Streaming train — vllm serve + trainer; hidden states are fetched
-#           per sample over HTTP (no on-disk dump)
-#   task_2: Benchmark — evaluate speculative decoding speedup via VLLM
+#   task_1: Streaming train — vllm serve + trainer; hidden states fetched over HTTP
+#   task_2: Benchmark — speculative decoding speedup via VLLM
 #
-# task_1 here uses the multi-node split (nodes=2): node 0 runs vllm serve, node 1
-# runs the trainer; they rendezvous via the shared /scratchspace mount. (Set
-# nodes=1 to co-locate both on one node instead.) All tasks share /scratchspace
-# to pass artifacts between steps.
+# task_1 uses nodes=2: node 0 runs vllm serve, node 1 the trainer. Tasks share
+# /scratchspace to pass artifacts.
 #
 # Usage:
 #   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml --yes
@@ -23,7 +20,6 @@ pipeline:
   global_vars:
     hf_model: /hf-local/Qwen/Qwen3-8B
 
-  # Step 1: Build input conversations
   task_0:
     script: common/eagle3/make_dataset.sh
     args:
@@ -36,11 +32,8 @@ pipeline:
       gpus_per_node: 1
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
-  # Step 2: Streaming EAGLE3 training
-  #
-  # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32];
-  # vllm capture ids are those shifted by +1, plus the final layer:
-  #   [2, 18, 33] + [36] = [2, 18, 33, 36].
+  # capture ids = default_eagle_aux_layer_ids(36)=[1,17,32] shifted +1, plus final
+  # layer 36 -> [2,18,33,36].
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -56,8 +49,7 @@ pipeline:
       - eagle.eagle_use_torch_compile=false
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
-      # No spaces: nemo_run emits `export FOO=value` without quotes, so a
-      # space-separated value would be split by the shell.
+      # No spaces: nemo_run emits unquoted `export FOO=value`, so spaces would split.
       - EAGLE_CAPTURE_IDS: "[2,18,33,36]"
       - SERVE_TP: "1"
     slurm_config:
@@ -67,7 +59,6 @@ pipeline:
       gpus_per_node: 1
       container: vllm/vllm-openai:latest
 
-  # Step 3: Benchmark speculative decoding (VLLM backend)
   task_2:
     script: common/specdec_bench/quick_check.sh
     args:
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
index d0c99f6f0be..3751ecbe96a 100644
--- a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
@@ -1,22 +1,7 @@
 # EAGLE3 streaming speculative decoding pipeline for Qwen3-8B — MULTI-NODE.
 #
-# Multi-node sibling of hf_streaming_eagle3.yaml. Both the serve and trainer sides
-# scale across multiple nodes, with multiple GPUs per node. task_1 allocates 4
-# nodes x 2 GPUs: SERVE_NODES (=2) run independent vllm serve replicas (TP=2 each),
-# the remaining 2 run multi-node-DDP trainers (2 GPUs each -> world_size=4). Tune
-# the split via slurm_config.nodes/gpus_per_node + the SERVE_NODES env:
-#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
-#
-# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
-# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
-# K..N-1 are trainers. The head trainer publishes its IP for accelerate's
-# rendezvous; every trainer reads all serve addresses and joins the DDP group.
-#
-# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler
-# shards the corpus across ALL trainer ranks and each rank fetches ONLY its own
-# shard, round-robin across the K serve replicas (data.streaming_server_url is the
-# comma-joined list). Trainer nodes scale effective batch / compute; serve nodes
-# scale data-production throughput (~K x).
+# task_1 splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES;
+# see common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
 #
 # 3-step pipeline:
 #   task_0: Build input conversations (jsonl)
@@ -49,12 +34,7 @@ pipeline:
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
-  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
-  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
-  #
-  # Qwen3-8B has 36 hidden layers; default_eagle_aux_layer_ids(36) = [1, 17, 32];
-  # vllm capture ids are those shifted by +1, plus the final layer:
-  #   [2, 18, 33] + [36] = [2, 18, 33, 36].
+  # Capture ids: default_eagle_aux_layer_ids(36)=[1,17,32] +1, plus final layer 36.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -70,16 +50,12 @@ pipeline:
       - eagle.eagle_use_torch_compile=false
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
-      # No spaces: nemo_run emits `export FOO=value` without quotes, so a
-      # space-separated value would be split by the shell.
+      # No spaces: nemo_run emits `export FOO=value` unquoted.
       - EAGLE_CAPTURE_IDS: "[2,18,33,36]"
-      # Each serve node has 2 GPUs -> TP=2.
       - SERVE_TP: "2"
       # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
       - SERVE_NODES: "2"
-      # Per-serve in-flight requests = (trainer ranks) x STREAMING_NUM_WORKERS / SERVE_NODES.
-      # Here 4 ranks x 4 / 2 serves = 8 concurrent per serve — fine for Qwen's
-      # max_num_seqs.
+      # Per-rank in-flight fetches; per-serve in-flight = trainer ranks x STREAMING_NUM_WORKERS / SERVE_NODES (here 4 x 4 / 2 = 8).
       - STREAMING_NUM_WORKERS: "4"
     slurm_config:
       _factory_: "slurm_factory"
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
index 47ef2950b95..5cb467b3f6a 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml
@@ -1,33 +1,6 @@
-# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4).
-#
-# Single-task pipeline that exercises the full convert→save→export path WITHOUT
-# actually training. Uses the same `common/specdec/dflash_online_training.sh`
-# entrypoint as a real DFlash run; all dry-run behaviour is expressed as dotlist
-# overrides on `main.py` (shared with EAGLE3 — `--dry_run` is mode-agnostic):
-#
-#   --dry_run                              → main.py skips trainer.train(), saves
-#                                            the (untrained) ModelOpt checkpoint
-#                                            to training.output_dir right after
-#                                            mtsp.convert(model, [("dflash", ...)])
-#   data.offline_data_path=<placeholder>   → setting an offline path makes
-#                                            mode='offline' → use_offline_training
-#                                            =True. Combined with
-#                                            use_fake_base_for_offline=true this
-#                                            loads a FakeBaseModel (only
-#                                            embed_tokens + lm_head), so the ~1T
-#                                            MoE base fits on a single GPU. The
-#                                            file is never read in --dry_run mode.
-#   model.trust_remote_code=true           → Kimi-K2.5 (deepseek_v3 arch) ships a
-#                                            custom modeling file
-#   dflash.dflash_mask_token_id=163838     → Kimi-K2.5 has no dedicated mask token;
-#                                            163838 is a reserved slot used as the
-#                                            DFlash mask
-#
-# The dflash_online_training.sh export block then writes an HF-format DFlash draft
-# to /scratchspace/dflash/exported-checkpoint-final with the correct architecture
-# (5-layer draft block, block_size=8) but untrained weights — acceptance ~0%, by
-# design. Useful for smoke-testing the launcher / convert / export plumbing and
-# validating downstream loaders without a real training run.
+# DFlash dry-run smoke test for Kimi-K2.5 (NVFP4): exercises the full
+# convert->save->export path WITHOUT training, to validate launcher/export
+# plumbing and downstream loaders. Exported draft has untrained weights.
 #
 # Usage:
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_dflash_dryrun.yaml --yes
@@ -41,18 +14,22 @@ pipeline:
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4/
 
-  # Convert → save → export (no training).
+  # Convert -> save -> export (no training).
   task_0:
     script: common/specdec/dflash_online_training.sh
     args:
+      # Skips trainer.train(), saves the untrained checkpoint right after convert.
       - --dry_run
       - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
+      # FakeBaseModel (embed_tokens + lm_head only) so the ~1T MoE base fits on a single GPU.
       - model.use_fake_base_for_offline=true
       - model.trust_remote_code=true
+      # An offline path forces mode=offline; value unused in dry-run.
       - data.offline_data_path=/tmp/dryrun-placeholder
       - training.output_dir=/scratchspace/dflash
       - training.disable_tqdm=true
+      # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask.
       - dflash.dflash_mask_token_id=163838
     slurm_config:
       _factory_: "slurm_factory"
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
index 62e3c742e65..50475b1a28e 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -1,39 +1,22 @@
 # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
-# GB200/Blackwell (HSG). Sibling of hf_streaming_eagle3.yaml — same vLLM-serve +
-# trainer split, but trains a DFlash drafter instead of EAGLE3 by pointing the
-# shared, algorithm-agnostic streaming script at the dflash recipe.
+# GB200/Blackwell: node 0 = vllm serve (TP=4, whole node), node 1 = DFlash
+# trainer. See common/eagle3/train_eagle_streaming.sh header for the mechanism.
 #
-# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4
-# model fit at TP=4 on ONE 4-GPU node with no cpu-offload. Topology: node 0 =
-# vllm serve (TP=4, whole node), node 1 = DFlash trainer (fake base), 2 nodes.
+# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi at TP=4 on one node.
 #
-# How streaming feeds DFlash: data.mode=streaming sets dflash_offline=True
-# (derived in modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline),
-# so the DFlash module consumes the streamed hidden states (base_model_outputs)
-# instead of running the fake base. vLLM dumps captured layers as
-# [seq, n_captured, hidden]; the dataset splits the LAST captured layer into
-# base_model_hidden_states (DFlash self-logit distillation) and the REST into
-# aux_hidden_states (DFlash's concatenated target-layer features). So n_captured
-# must be (num DFlash target layers + 1).
+# data.mode=streaming sets dflash_offline=True, so the DFlash module consumes the
+# streamed hidden states instead of running the fake base.
+# Capture ids = [2,16,31,45,59,60] (kimi_k25/deepseek_v3, 61 layers): 5 DFlash
+# target layers + base 60. n_captured = num_target_layers + 1.
 #
-# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
-# the true final layer is NOT capturable so we use 60 as the base):
-#   DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5)
-#   = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) =
-#   [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the
-#   dataset yields 5 aux layers, matching the 5-layer DFlash draft block.
+# answer_only_loss forced false: Kimi's chat template lacks {% generation %} tags
+# needed to derive the assistant-token mask; flip on with a tagged template.
 #
-# answer_only_loss: forced false here. DFlash's recipe default is true, which
-# requires the tokenizer chat template to carry {% generation %} tags so the
-# streaming dataset can derive an assistant-token mask; Kimi's template does not,
-# and the streaming path does not inject data.chat_template. To train
-# assistant-only later, supply a generation-tagged template and flip this on.
-#
-# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
-#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
 #          SLURM_PARTITION=batch \
-#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
-#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          SLURM_HF_LOCAL=<hf_models_dir> \
+#          SLURM_JOB_DIR=<experiments_dir> \
 #          NEMORUN_HOME=$PWD
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
@@ -50,7 +33,6 @@ pipeline:
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
 
-  # Step 1: Build input conversations (model-agnostic)
   task_0:
     script: common/eagle3/make_dataset.sh
     args:
@@ -60,13 +42,10 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
-      # so request 4 even though make_dataset is CPU-only.
+      # The cluster QOS requires whole-node GPU allocation though make_dataset is CPU-only.
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
-  # Step 2: Streaming DFlash training (node0 serve TP=4 / node1 train), 4 GPU/node.
-  # Reuses the shared streaming orchestrator common/eagle3/train_eagle_streaming.sh.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -77,52 +56,39 @@ pipeline:
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
       - training.output_dir=/scratchspace/dflash
-      # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
+      # Must be divisible by dflash_block_size (8).
       - training.training_seq_len=4096
       - training.disable_tqdm=true
       - training.num_train_epochs=1
       - training.max_steps=3000
-      # See header: Kimi's template lacks {% generation %} tags; train on all tokens.
+      # Kimi's template lacks {% generation %} tags; train on all tokens (see header).
       - training.answer_only_loss=false
-      # dflash.yaml sets report_to=tensorboard, but the vLLM container has no
-      # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable
-      # reporting (loss still prints to stdout via logging_steps).
+      # vLLM container has no tensorboard (dflash.yaml's default report_to); disable.
       - training.report_to=none
-      # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838
-      # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run).
+      # Kimi-K2.5 has no dedicated mask token; 163838 is a reserved slot used as one.
       - dflash.dflash_mask_token_id=163838
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
       # No spaces in values: nemo_run emits `export FOO=value` unquoted.
-      # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
       - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
       - SERVE_TP: "4"
-      # DataLoader workers per trainer rank = in-flight requests per rank. All
-      # trainer ranks fetch, so per-serve in-flight = trainer_world_size(4) x
-      # STREAMING_NUM_WORKERS. Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood
-      # stalls a cold NVFP4-MoE worker past vLLM's timeout and kills EngineCore.
+      # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
       - STREAMING_NUM_WORKERS: "1"
       # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
-      # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context
-      # to the training seq len, leaving headroom for activation spikes.
+      # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4.
       - SERVE_MAX_MODEL_LEN: "4096"
-      # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # A worker can stall on the first real serving step (cold NVFP4 MoE kernels)
-      # past vLLM's default execute-model timeout, killing EngineCore. Extend the
-      # timeouts (seconds) that govern that path.
+      # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds).
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 2
-      # Pin the serve node + trainer node into one NVL72 block. Inter-node here is
-      # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this
-      # is a latency/locality nicety rather than a correctness requirement.
+      # Pin nodes into one NVL72 block (latency/locality; inter-node is HTTP + lustre, not NCCL).
       segment: 2
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
index 703f636d53b..fedf729a038 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -1,59 +1,20 @@
 # DFlash streaming speculative-decoding training for Kimi-K2.5-NVFP4 on
-# GB200/Blackwell (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of
-# hf_streaming_dflash.yaml: BOTH sides scale out. SERVE_NODES (=2) run independent
-# vllm serve replicas (TP=4 each, whole node); the remaining nodes run multi-node-DDP
-# trainers (4 GPUs each). This file allocates 4 nodes = 2 serve + 2 trainer
-# (world_size=8). Tune the split via slurm_config.nodes/segment + SERVE_NODES:
-#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
+# GB200/Blackwell — MULTI-NODE: both serve and trainer sides scale out.
+# nodes=N, SERVE_NODES=K -> K serve replicas (TP=4, whole node) + (N-K) trainer
+# nodes. See common/eagle3/train_eagle_streaming.sh for dispatch/sharding/scaling.
 #
-# Why GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB Kimi-K2.5-NVFP4 model fit
-# at TP=4 on ONE 4-GPU node with no cpu-offload. So each serve replica owns a whole
-# node at TP=4, and each trainer node uses all 4 GPUs for the draft (fake base).
+# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi-K2.5-NVFP4 at TP=4
+# on one 4-GPU node, so each serve replica owns a whole node.
 #
-# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
-# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
-# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d
-# rendezvous; every trainer reads all K serve addresses and joins the DDP group.
-# segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
-# rides NVLink.
+# Capture ids: build_target_layer_ids(num_orig=61, num_draft=5)=[1,15,30,44,58]
+# -> +1 for embedding = [2,16,31,45,59], append base 60 (final layer uncapturable).
+# 6 captured = 5 aux layers, matching the 5-layer DFlash draft block.
 #
-# How it scales: HF Trainer's DistributedSampler shards the corpus across ALL
-# trainer ranks and each rank fetches ONLY its own shard, round-robin across the K
-# serve replicas (data.streaming_server_url is the comma-joined list the script
-# assembles). Trainer nodes scale effective batch / compute and distribute the
-# lustre reads; serve nodes scale data-production throughput (~K x).
-#
-# Concurrency: keep it low for cold NVFP4 MoE (see SERVE_MAX_NUM_SEQS below).
-# Per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
-# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4.
-#
-# How streaming feeds DFlash: data.mode=streaming derives dflash_offline=True
-# (modelopt/recipe/config.py ModelOptDFlashRecipe._derive_dflash_offline), so the
-# DFlash module consumes the streamed hidden states (base_model_outputs) instead of
-# running the fake base. vLLM dumps captured layers as [seq, n_captured, hidden];
-# the dataset splits the LAST captured layer into base_model_hidden_states (DFlash
-# self-logit distillation) and the REST into aux_hidden_states (DFlash's
-# concatenated target-layer features). So n_captured must be (num DFlash target
-# layers + 1).
-#
-# Capture ids (kimi_k25 / deepseek_v3 arch, 61 layers, capture id space 0..60;
-# the true final layer is NOT capturable so we use 60 as the base, same as EAGLE3):
-#   DFlash target layers come from build_target_layer_ids(num_orig=61, num_draft=5)
-#   = [1,15,30,44,58] (0-based) -> vLLM ids (+1 for the embedding layer) =
-#   [2,16,31,45,59]. Append base 60. captured = [2,16,31,45,59,60] = 6, so the
-#   dataset yields 5 aux layers, matching the 5-layer DFlash draft block.
-#
-# answer_only_loss: forced false here. DFlash's recipe default is true, which
-# requires the tokenizer chat template to carry {% generation %} tags so the
-# streaming dataset can derive an assistant-token mask; Kimi's template does not,
-# and the streaming path (unlike online) does not inject data.chat_template. To
-# train assistant-only later, supply a generation-tagged template and flip this on.
-#
-# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
-#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
 #          SLURM_PARTITION=batch \
-#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
-#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          SLURM_HF_LOCAL=<hf_models_dir> \
+#          SLURM_JOB_DIR=<experiments_dir> \
 #          NEMORUN_HOME=$PWD
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
@@ -70,7 +31,7 @@ pipeline:
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
 
-  # Step 1: Build input conversations (model-agnostic)
+  # Build input conversations.
   task_0:
     script: common/eagle3/make_dataset.sh
     args:
@@ -80,14 +41,11 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
-      # so request 4 even though make_dataset is CPU-only.
+      # The cluster QOS requires whole-node GPU alloc even though make_dataset is CPU-only.
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
-  # Step 2: Streaming DFlash training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each).
-  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
-  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
+  # Streaming DFlash training: 2 serve replicas (TP=4) + 2 trainer nodes.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -98,51 +56,41 @@ pipeline:
       - data.mode=streaming
       - data.data_path=/scratchspace/data/train.jsonl
       - training.output_dir=/scratchspace/dflash
-      # Must be divisible by dflash_block_size (8). DFlash trains on fixed-length blocks.
+      # Must be divisible by dflash_block_size (8).
       - training.training_seq_len=4096
       - training.disable_tqdm=true
       - training.ar_validate_steps=500000
       - training.num_train_epochs=1
       - training.max_steps=500
-      # See header: Kimi's template lacks {% generation %} tags; train on all tokens.
+      # Kimi's template lacks {% generation %} tags, so train on all tokens.
       - training.answer_only_loss=false
-      # dflash.yaml sets report_to=tensorboard, but the vLLM container has no
-      # tensorboard -> TensorBoardCallback RuntimeError at trainer init. Disable
-      # reporting (loss still prints to stdout via logging_steps).
+      # vLLM container has no tensorboard (dflash.yaml's default) -> init crash.
       - training.report_to=none
-      # Kimi-K2.5 has no dedicated mask token ([EOS]=163585, [PAD]=163839); 163838
-      # is a reserved slot used as the DFlash mask (matches the real Kimi DFlash run).
+      # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask.
       - dflash.dflash_mask_token_id=163838
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
-      # No spaces in values: nemo_run emits `export FOO=value` unquoted.
-      # DFlash target layers (vLLM-indexed) + base 60; see header for derivation.
+      # Capture ids: see header for derivation. No spaces in values: nemo_run emits `export FOO=value` unquoted.
       - EAGLE_CAPTURE_IDS: "[2,16,31,45,59,60]"
-      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
       - SERVE_NODES: "2"
-      # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi).
       - SERVE_TP: "4"
-      # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1:
-      # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS.
+      # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
       - STREAMING_NUM_WORKERS: "1"
-      # DFlash on a custom-modeling base (Kimi) needs --trust_remote_code at export.
+      # Kimi's custom-modeling base needs --trust_remote_code at export.
       - EXPORT_EXTRA_ARGS: "--trust_remote_code"
-      # The model's native max_seq_len is 262144, whose KV cache OOMs. Cap context
-      # to the training seq len and leave headroom for activation spikes.
+      # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4.
       - SERVE_MAX_MODEL_LEN: "4096"
-      # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # A worker can stall on the first real serving step (cold NVFP4 MoE kernels)
-      # past vLLM's default execute-model timeout, killing EngineCore. Extend the
-      # timeouts that govern that path (seconds).
+      # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds).
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 4
+      # Pin all nodes into one NVL72 block so inter-node trainer DDP traffic rides NVLink.
       segment: 4
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
index 3f6cf23d1b7..5ace5e83847 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml
@@ -1,19 +1,16 @@
-# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell (HSG).
+# EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell.
 #
-# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE
-# 4-GPU node with no cpu-offload (on H100 it needs offload and is too slow to be
-# usable). Topology: node 0 = vllm serve (TP=4, whole node), node 1 = EAGLE3 trainer
-# (fake base); 4 GPUs each, 2 nodes.
+# Requires GB200: native NVFP4 + 192 GB/GPU fits the ~551 GB model at TP=4 on one node.
+# node 0 = vllm serve (TP=4), node 1 = EAGLE3 trainer (fake base); 4 GPUs each.
 #
-# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed
-# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60.
-# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base.
+# Capture ids: deepseek_v3 arch, 61 layers, indexed by layer input (0..60);
+# [2,30,58] aux + [60] base (final layer not capturable).
 #
-# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
-#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
 #          SLURM_PARTITION=batch \
-#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
-#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          SLURM_HF_LOCAL=<hf_models_dir> \
+#          SLURM_JOB_DIR=<experiments_dir> \
 #          NEMORUN_HOME=$PWD
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
@@ -37,8 +34,7 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
-      # so request 4 even though make_dataset is CPU-only.
+      # The cluster QOS requires whole-node GPU alloc (4) even though make_dataset is CPU-only.
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
@@ -61,38 +57,24 @@ pipeline:
       - eagle.eagle_use_torch_compile=false
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
-      # No spaces in values: nemo_run emits `export FOO=value` unquoted.
+      # No spaces: nemo_run emits `export FOO=value` unquoted.
       - EAGLE_CAPTURE_IDS: "[2,30,58,60]"
       - SERVE_TP: "4"
-      # DataLoader workers per trainer rank = in-flight requests per rank. The
-      # streaming dataset is map-style: ALL trainer ranks fetch (not just rank 0),
-      # so per-serve in-flight = trainer_world_size(4) x STREAMING_NUM_WORKERS.
-      # Keep at 1 -> 4 = SERVE_MAX_NUM_SEQS; a wider flood stalls a cold NVFP4-MoE
-      # worker past vLLM's engine<->worker timeout, killing EngineCore -> trainer abort.
+      # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
       - STREAMING_NUM_WORKERS: "1"
-      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
-      # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with
-      # 183/184 GB used). Cap context to the training seq len and leave headroom
-      # for activation spikes during the profiling forward.
+      # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4.
       - SERVE_MAX_MODEL_LEN: "4096"
-      # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # The killer was "RPC call to sample_tokens timed out" — a worker stalls on
-      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
-      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
-      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds).
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 2
-      # Pin the serve node + trainer node into one NVL72 block. Inter-node here is
-      # HTTP hidden-state fetch + shared /scratchspace (lustre), not NCCL, so this
-      # is a latency/locality nicety rather than a correctness requirement (cf. the
-      # multi_node examples, where cross-node trainer DDP makes segment essential).
+      # Pin nodes into one NVL72 block (latency nicety here; essential when trainers do cross-node DDP).
       segment: 2
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
index bddf6b06909..e57c78f3cc1 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
@@ -1,43 +1,17 @@
 # EAGLE3 streaming speculative decoding for Kimi-K2.5-NVFP4 on GB200/Blackwell
-# (HSG) — MULTI-NODE (multi-serve). Multi-node sibling of hf_streaming_eagle3.yaml:
-# BOTH sides scale out. SERVE_NODES (=2) run independent vllm serve replicas (TP=4
-# each, whole node); the remaining nodes run multi-node-DDP trainers (4 GPUs each).
-# This file allocates 4 nodes = 2 serve + 2 trainer (world_size=8). Tune the split
-# via slurm_config.nodes/segment + SERVE_NODES:
-#   nodes=N, SERVE_NODES=K  ->  K serve replicas + (N-K) trainer nodes.
+# MULTI-NODE: K serve replicas (TP=4, whole node) + (N-K) DDP trainer nodes.
+# This file: nodes=4, SERVE_NODES=2 -> 2 serve + 2 trainer. For dispatch and scaling
+# details, see the header of common/eagle3/train_eagle_streaming.sh.
 #
-# Requires GB200: native NVFP4 + 192 GB/GPU lets the ~551 GB model fit at TP=4 on ONE
-# 4-GPU node with no cpu-offload. Each serve replica owns a whole node at TP=4; each
-# trainer node uses all 4 GPUs for the draft (fake base).
+# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi at TP=4 on one node.
+# Capture ids = [2,30,58] aux + [60] base = 4 (kimi_k25/deepseek_v3, 61 layers;
+# layer 60 is the last capturable, used as base).
 #
-# Topology (see common/eagle3/train_eagle_streaming.sh header for the dispatch):
-# Slurm nodes 0..K-1 each serve and publish their address via /scratchspace; nodes
-# K..N-1 are trainers. The head trainer publishes its IP for accelerate's c10d
-# rendezvous; every trainer reads all K serve addresses and joins the DDP group.
-# segment=<nodes> pins all nodes into one NVL72 block so inter-node DDP traffic
-# rides NVLink.
-#
-# How it scales: the dataset is map-style, so HF Trainer's DistributedSampler shards
-# the corpus across ALL trainer ranks and each rank fetches ONLY its own shard,
-# round-robin across the K serve replicas (data.streaming_server_url is the
-# comma-joined list the script assembles). So trainer nodes scale effective batch /
-# compute and distribute the lustre reads; serve nodes scale data-production
-# throughput (~K x), lifting the single-serve ceiling.
-#
-# Concurrency (must stay low for cold NVFP4 MoE — see SERVE_MAX_NUM_SEQS below):
-# per-serve in-flight requests = world_size x STREAMING_NUM_WORKERS / SERVE_NODES
-# = 8 x 1 / 2 = 4, matching SERVE_MAX_NUM_SEQS=4. Flooding a cold NVFP4-MoE server
-# stalls a worker past vLLM's execute-model timeout and kills EngineCore.
-#
-# Capture ids: kimi_k25 (deepseek_v3 arch), 61 layers. aux states are indexed
-# by layer INPUT (0..60); the final layer is NOT capturable, so the base is 60.
-# captured = [2,30,58] aux + [60] base = 4, matching the trainer's 3-aux+base.
-#
-# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
-#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
 #          SLURM_PARTITION=batch \
-#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
-#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          SLURM_HF_LOCAL=<hf_models_dir> \
+#          SLURM_JOB_DIR=<experiments_dir> \
 #          NEMORUN_HOME=$PWD
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
@@ -51,7 +25,6 @@ pipeline:
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
 
-  # Step 1: Build input conversations (model-agnostic)
   task_0:
     script: common/eagle3/make_dataset.sh
     args:
@@ -61,14 +34,10 @@ pipeline:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      # HSG QOS (QOSMinGRES) requires whole-node GPU allocation (4 on GB200),
-      # so request 4 even though make_dataset is CPU-only.
+      # The cluster QOS requires whole-node GPU allocation (4) even though make_dataset is CPU-only.
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
-  # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=4) + 2 trainer nodes (4 GPU each).
-  # Reuses the shared streaming orchestrator (common/eagle3/train_eagle_streaming.sh);
-  # SERVE_NODES + nodes select the K-serve + (N-K)-trainer multi-node topology.
   task_1:
     script: common/eagle3/train_eagle_streaming.sh
     args:
@@ -89,40 +58,28 @@ pipeline:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
       # No spaces in values: nemo_run emits `export FOO=value` unquoted.
       - EAGLE_CAPTURE_IDS: "[2,30,58,60]"
-      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
       - SERVE_NODES: "2"
-      # Each serve node has 4 GB200 GPUs -> TP=4 (whole node, the only fit for Kimi).
       - SERVE_TP: "4"
-      # DataLoader workers per trainer rank = in-flight requests per rank. Keep at 1:
-      # per-serve in-flight = world_size(8) x 1 / SERVE_NODES(2) = 4 = SERVE_MAX_NUM_SEQS.
-      # A wider flood stalls a cold NVFP4-MoE worker past vLLM's engine<->worker
-      # timeout, killing EngineCore (TimeoutError) mid-serve -> 500s -> trainer abort.
+      # Per-rank in-flight fetches; keep low so the cold NVFP4-MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
       - STREAMING_NUM_WORKERS: "1"
-      # Kimi-K2.5-NVFP4 ~138 GB weights/GPU at TP=4; GB200 has 184 GB. The model's
-      # native max_seq_len is 262144, whose KV cache OOMs (first attempt died with
-      # 183/184 GB used). Cap context to the training seq len and leave headroom
-      # for activation spikes during the profiling forward.
+      # Cap context to the train seq len; the model's native 262144 KV-cache OOMs at TP=4.
       - SERVE_MAX_MODEL_LEN: "4096"
-      # Small batches: smaller per-step MoE compute stays under the engine timeout.
       - SERVE_MAX_NUM_SEQS: "4"
       - SERVE_GPU_MEM_UTIL: "0.8"
       - SERVE_READY_TIMEOUT: "2400"
       - SERVE_EXTRA_ARGS: "--trust-remote-code"
-      # The killer was "RPC call to sample_tokens timed out" — a worker stalls on
-      # the first real serving step (cold NVFP4 MoE kernels) past vLLM's default
-      # execute-model timeout, so EngineCore dies. Extend the timeouts that govern
-      # that path (seconds). VLLM_RPC_TIMEOUT (ms) is a different RPC and didn't help.
+      # Cold NVFP4-MoE kernels stall the first serving step past vLLM's default execute-model timeout; raise it (seconds).
       - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "1200"
       - VLLM_ENGINE_ITERATION_TIMEOUT_S: "1200"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 4
+      # Pin nodes into one NVL72 block (essential for cross-node trainer DDP).
       segment: 4
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
-  # Step 3: Benchmark speculative decoding (VLLM backend, Kimi served at TP=4)
   task_2:
     script: common/specdec_bench/quick_check.sh
     args:
@@ -135,7 +92,6 @@ pipeline:
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
       - --concurrency 32
-      # Kimi has custom modeling code; bench run.py loads base+tokenizer and needs this.
       - --trust_remote_code
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
index b2eea8c1ec6..7c37015d90d 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/specdec_bench.yaml
@@ -1,44 +1,26 @@
-# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM.
+# DFLASH speculative-decoding benchmark for Kimi-K2.5-NVFP4 via vLLM (in-process
+# AsyncLLM, TP=4 + EP), benchmarking on MT-Bench. Outputs to /scratchspace/specdec_bench/.
 #
-# Serves Kimi-K2.5-NVFP4 in-process (no HTTP server — specdec_bench drives an
-# AsyncLLM) at TP=4 with expert parallelism, attaches a trained/exported DFLASH
-# draft, and benchmarks speculative decoding on MT-Bench. Writes timing.json +
-# aa_timing.json + acceptance_rate.json + mtbench.json + specbench_responses.jsonl
-# to /scratchspace/specdec_bench/.
+# Requires GB200: native NVFP4 + 192 GB/GPU fits ~551 GB Kimi-K2.5-NVFP4 at TP=4
+# on one 4-GPU node.
 #
-# Requires GB200/Blackwell (HSG): Kimi-K2.5-NVFP4 (~551 GB) needs native NVFP4 + the
-# 192 GB/GPU of GB200 to fit at TP=4 on ONE 4-GPU node with no cpu-offload.
+# DFLASH: draft tokens default to 8 (=block_size); --draft_length does NOT apply.
+# To override sampling/engine args, add `- --runtime_params <yaml>` (see
+# examples/specdec_bench/README.md).
 #
-# DFLASH specifics:
-#   - draft tokens default to 8 in specdec_bench (matches DFlash block_size=8);
-#     --draft_length does NOT apply to DFLASH. To override sampling / engine args
-#     (e.g. speculative_num_draft_tokens, temperature), write a runtime-params
-#     yaml and add `- --runtime_params <path>` below — see
-#     examples/specdec_bench/README.md (runtime_args_long_context.yaml pattern).
-#   - --draft_model_dir must point at a trained+exported HF-format DFLASH draft
-#     (e.g. produced by hf_offline_dflash.yaml / a real DFlash run). Set it via the
-#     `draft_model_dir` global_var below, or override on the CLI:
-#     pipeline.global_vars.draft_model_dir=/hf-local/<draft>
-#   - Kimi needs --trust_remote_code for both tokenizer and model.
+# NOTE on dataset: MT-Bench needs no data-prep. For SPEED-Bench instead, first run
+# `prepare_data.py --dataset speed --config all`, then replace --mtbench with
+# `--dataset speed` + `--dataset_path .../data/speed/<split>`.
 #
-# NOTE on dataset: uses MT-Bench (the question.jsonl staged under /hf-local), so
-# it runs without any data-prep step. To benchmark on SPEED-Bench instead, first
-# generate + stage a split:
-#     python3 examples/specdec_bench/prepare_data.py --dataset speed --config all
-# (splits: qualitative, throughput_1k, throughput_16k, ...) then swap the
-# `--mtbench` arg for:
-#     - --dataset speed
-#     - --dataset_path modules/Model-Optimizer/examples/specdec_bench/data/speed/throughput_16k
+# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH; on
+# GB200/aarch64 use an aarch64 DFLASH-capable image (e.g. a 0511 nightly), via
+# pipeline.task_0.slurm_config.container=<image>. UNRESOLVED.
 #
-# NOTE on container: vllm/vllm-openai:latest is x86 and may lack DFLASH support;
-# on GB200/aarch64 use an aarch64 vLLM image new enough for DFLASH (validated on
-# a 0511 nightly). Override with: pipeline.task_0.slurm_config.container=<image>
-#
-# Run ON the HSG login node (paramiko can't reach HSG through the sss proxy):
-#   export SLURM_HOST=localhost SLURM_ACCOUNT=coreai_dlalgo_modelopt \
+# Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
+#   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
 #          SLURM_PARTITION=batch \
-#          SLURM_HF_LOCAL=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local \
-#          SLURM_JOB_DIR=/lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_modelopt/users/haoguo/experiments \
+#          SLURM_HF_LOCAL=<hf_models_dir> \
+#          SLURM_JOB_DIR=<experiments_dir> \
 #          NEMORUN_HOME=$PWD
 #   uv run launch.py --yaml examples/moonshotai/Kimi-K2.5/specdec_bench.yaml \
 #          identity=$HOME/.ssh/id_ecdsa detach=True --yes
@@ -52,8 +34,7 @@ pipeline:
 
   global_vars:
     hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
-    # Trained + exported HF-format DFLASH draft checkpoint. Defaults to the standard
-    # export path; override on the CLI with: pipeline.global_vars.draft_model_dir=<path>
+    # Trained+exported DFLASH draft; override: pipeline.global_vars.draft_model_dir=<path>
     draft_model_dir: /hf-local/nvidia/Kimi-K2.5-DFlash
 
   task_0:

From 9760dec8eee9e568d415524a94d803839eda377e Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Thu, 4 Jun 2026 23:54:27 +0000
Subject: [PATCH 13/14] address comments

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 .../speculative/plugins/hf_streaming_dataset.py     | 13 +++++++++++--
 .../launcher/common/eagle3/train_eagle_streaming.sh |  7 ++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
index 65b2cd4f0d7..1dd38b64154 100644
--- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
+++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
@@ -374,7 +374,9 @@ def _client(self) -> httpx.Client:
 
         DataLoader workers are forked processes; httpx connection pools must not be
         shared across a fork, so each process gets its own client (and its own
-        round-robin cursor over ``server_urls``), keyed by PID.
+        round-robin cursor over ``server_urls``), keyed by PID. The cursor starts
+        at a per-(rank, worker) offset so cold-start fetches fan out across
+        replicas instead of all hitting ``server_urls[0]``.
         """
         pid = os.getpid()
         if getattr(self, "_client_pid", None) != pid:
@@ -382,7 +384,14 @@ def _client(self) -> httpx.Client:
                 timeout=httpx.Timeout(self.config.request_timeout, connect=10.0)
             )
             self._client_pid = pid
-            self._rr = 0
+            # Stagger the initial cursor by (rank, worker) so cold-start fetches
+            # fan out instead of all pinning server_urls[0] (which can flood one
+            # cold replica past its execute-model timeout and kill the EngineCore).
+            info = torch.utils.data.get_worker_info()
+            worker_id = info.id if info is not None else 0
+            num_workers = info.num_workers if info is not None else 1
+            rank = int(os.environ.get("RANK", "0"))
+            self._rr = rank * num_workers + worker_id
         return self._http
 
     def _next_url(self) -> str:
diff --git a/tools/launcher/common/eagle3/train_eagle_streaming.sh b/tools/launcher/common/eagle3/train_eagle_streaming.sh
index 6f9636c459a..49b54709d35 100755
--- a/tools/launcher/common/eagle3/train_eagle_streaming.sh
+++ b/tools/launcher/common/eagle3/train_eagle_streaming.sh
@@ -236,7 +236,12 @@ run_trainer_and_export() {
     # custom-modeling base (e.g. Kimi) pass --trust_remote_code; empty by default.
     local out_dir
     out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1)
-    out_dir="${out_dir:-/scratchspace/eagle3}"
+    # Fail loud rather than guess a default: a wrong dir would silently export the
+    # wrong checkpoint. Every streaming yaml already forwards training.output_dir=.
+    if [ -z "$out_dir" ]; then
+        echo "ERROR: no training.output_dir= forwarded in SCRIPT_ARGS; cannot locate checkpoint to export." >&2
+        return 1
+    fi
     python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \
         --model_path "$out_dir" \
         --export_path "${EXPORT_PATH:-/scratchspace/export}" \

From 4c33ef57980170ea11dc39bce5e1b46f89da9128 Mon Sep 17 00:00:00 2001
From: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:38:54 +0000
Subject: [PATCH 14/14] kimi answer only

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
---
 examples/speculative_decoding/eagle_utils.py  |   4 +-
 modelopt/torch/speculative/eagle/utils.py     |  26 +++-
 .../plugins/hf_streaming_dataset.py           |  27 ++--
 modelopt/torch/utils/__init__.py              |   1 +
 modelopt/torch/utils/loss_mask.py             | 139 ++++++++++++++++++
 .../Kimi-K2.5/hf_streaming_dflash.yaml        |  10 +-
 .../hf_streaming_dflash_multi_node.yaml       |   5 +-
 7 files changed, 192 insertions(+), 20 deletions(-)
 create mode 100644 modelopt/torch/utils/loss_mask.py

diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py
index f3ef93d740e..626ea786237 100644
--- a/examples/speculative_decoding/eagle_utils.py
+++ b/examples/speculative_decoding/eagle_utils.py
@@ -139,7 +139,9 @@ def make_speculative_data_module(
             raise ValueError("sample_size must be -1 (use all samples) or a positive integer")
         if data_args.sample_size > 0:
             dumped_files = dumped_files[: data_args.sample_size]
-        train_dataset = OfflineSupervisedDataset(dumped_files, answer_only_loss=answer_only_loss)
+        train_dataset = OfflineSupervisedDataset(
+            dumped_files, answer_only_loss=answer_only_loss, tokenizer=tokenizer
+        )
         data_collator = EagleOfflineDataCollator(train_len=train_len)
 
     return {
diff --git a/modelopt/torch/speculative/eagle/utils.py b/modelopt/torch/speculative/eagle/utils.py
index f74fcb1e9fb..2c536d04991 100644
--- a/modelopt/torch/speculative/eagle/utils.py
+++ b/modelopt/torch/speculative/eagle/utils.py
@@ -41,6 +41,8 @@
 from torch.utils.data import Dataset
 from transformers.trainer_pt_utils import LabelSmoother
 
+from modelopt.torch.utils.loss_mask import get_loss_mask_recovery
+
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
 
@@ -96,20 +98,27 @@ class OfflineSupervisedDataset(Dataset):
         dumped_files (list): A list of file paths to the dumped .pt files.
         answer_only_loss (bool): If True, use the ``loss_mask`` stored in each .pt
             file so that only assistant-produced tokens contribute to the loss.
-            Raises ``ValueError`` on ``__getitem__`` if the file lacks ``loss_mask``.
+            If a file lacks ``loss_mask`` and ``tokenizer`` has a registered
+            model-specific recovery (see ``modelopt.torch.utils.loss_mask``), the
+            mask is rebuilt from ``input_ids``; otherwise ``__getitem__`` raises
+            ``ValueError``.
             If False (default), a uniform all-ones mask is used regardless of what
             is stored in the file (backward compatible).
+        tokenizer: Optional tokenizer used to recover the assistant mask for dumps
+            that lack a stored ``loss_mask``.
     """
 
     def __init__(
         self,
         dumped_files,
         answer_only_loss: bool = False,
+        tokenizer=None,
     ):
         """Initialize with a list of .pt file paths."""
         super().__init__()
         self.dumped_files = dumped_files
         self.answer_only_loss = answer_only_loss
+        self.tokenizer = tokenizer
 
     def __len__(self):
         return len(self.dumped_files)
@@ -121,13 +130,22 @@ def __getitem__(self, i) -> dict[str, torch.Tensor]:
         labels[..., :-1] = offline_data["input_ids"][..., 1:]
 
         if self.answer_only_loss:
-            if "loss_mask" not in offline_data:
+            recovery = get_loss_mask_recovery(self.tokenizer) if self.tokenizer else None
+            if "loss_mask" in offline_data:
+                loss_mask = offline_data["loss_mask"].to(offline_data["input_ids"].dtype)
+            elif recovery is not None:
+                # Dumps from tokenizers that cannot emit assistant masks carry no
+                # loss_mask; rebuild it from the token ids.
+                loss_mask = recovery.compute(self.tokenizer, offline_data["input_ids"]).to(
+                    offline_data["input_ids"].dtype
+                )
+            else:
                 raise ValueError(
                     f"answer_only_loss=True requires a 'loss_mask' entry in the offline "
                     f".pt file, but {self.dumped_files[i]} does not have one. Re-dump "
-                    f"with --answer-only-loss in compute_hidden_states_*.py."
+                    f"with --answer-only-loss in compute_hidden_states_*.py, or pass a "
+                    f"tokenizer with a registered loss-mask recovery."
                 )
-            loss_mask = offline_data["loss_mask"].to(offline_data["input_ids"].dtype)
         else:
             loss_mask = torch.ones_like(offline_data["input_ids"])
 
diff --git a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
index 1dd38b64154..c1be45e9e56 100644
--- a/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
+++ b/modelopt/torch/speculative/plugins/hf_streaming_dataset.py
@@ -61,6 +61,7 @@
 from transformers.trainer_pt_utils import LabelSmoother
 
 from modelopt.torch.utils import print_rank_0, warn_rank_0
+from modelopt.torch.utils.loss_mask import get_loss_mask_recovery
 
 __all__ = [
     "EagleFetchPayload",
@@ -100,31 +101,39 @@ def _tokenize_with_loss_mask(
     tags so the tokenizer can return ``assistant_masks``. When ``max_seq_len`` is set,
     truncation is delegated to the tokenizer so ids and assistant_masks are truncated
     in lockstep.
+
+    ``assistant_masks`` requires a fast tokenizer (it needs ``char_to_token``). For
+    tokenizers without it, the mask is rebuilt from token ids via a registered
+    model-specific recovery (see ``modelopt.torch.utils.loss_mask``) if one matches.
     """
+    recovery = None
+    if answer_only_loss and not getattr(tokenizer, "is_fast", False):
+        recovery = get_loss_mask_recovery(tokenizer)
     out = tokenizer.apply_chat_template(
         conversations,
         tokenize=True,
         return_tensors="pt",
         return_dict=True,
-        return_assistant_tokens_mask=answer_only_loss,
+        return_assistant_tokens_mask=answer_only_loss and recovery is None,
         add_generation_prompt=False,
         truncation=max_seq_len is not None,
         max_length=max_seq_len,
     )
     input_ids = out["input_ids"]
     seq_len = input_ids.shape[-1]
-    if answer_only_loss:
+    if not answer_only_loss:
+        loss_mask = torch.ones(seq_len, dtype=torch.long)
+    elif recovery is not None:
+        loss_mask = recovery.compute(tokenizer, input_ids[0])
+    else:
         mask = out["assistant_masks"]
         if not isinstance(mask, torch.Tensor):
             mask = torch.tensor(mask, dtype=torch.long)
         loss_mask = mask.squeeze(0).to(torch.long)
-        if loss_mask.shape[0] != seq_len:
-            raise RuntimeError(
-                f"assistant_masks length {loss_mask.shape[0]} does not match "
-                f"input_ids length {seq_len}"
-            )
-    else:
-        loss_mask = torch.ones(seq_len, dtype=torch.long)
+    if loss_mask.shape[0] != seq_len:
+        raise RuntimeError(
+            f"loss_mask length {loss_mask.shape[0]} does not match input_ids length {seq_len}"
+        )
     return input_ids, loss_mask
 
 
diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py
index 51d02248c14..a38c80cac01 100644
--- a/modelopt/torch/utils/__init__.py
+++ b/modelopt/torch/utils/__init__.py
@@ -22,6 +22,7 @@
 from .import_utils import *
 from .list import *
 from .logging import *
+from .loss_mask import *
 from .network import *
 from .perf import *
 from .regex import *
diff --git a/modelopt/torch/utils/loss_mask.py b/modelopt/torch/utils/loss_mask.py
new file mode 100644
index 00000000000..839bce24b8e
--- /dev/null
+++ b/modelopt/torch/utils/loss_mask.py
@@ -0,0 +1,139 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Model-specific recovery of the assistant loss mask.
+
+The standard way to build an answer-only loss mask is
+``apply_chat_template(..., return_assistant_tokens_mask=True)``, which maps the
+``{% generation %}`` template span to tokens via ``char_to_token`` -- and that is
+only available on "fast" tokenizers. Some models ship only a slow/Python tokenizer
+and cannot use this path.
+
+This module is a small registry of per-model fallbacks that recover the mask
+directly from token ids, keyed by a ``detect`` predicate. Data paths consult
+:func:`get_loss_mask_recovery` and stay free of any single model's chat-format
+details. It is intentionally minimal and is meant to seed a broader model-specific
+patch registry.
+"""
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import torch
+
+__all__ = ["LossMaskRecovery", "get_loss_mask_recovery", "register_loss_mask_recovery"]
+
+
+@dataclass(frozen=True)
+class LossMaskRecovery:
+    """A model-specific fallback for building the assistant loss mask.
+
+    Args:
+        name: Identifier for the target model family (for logging/debugging).
+        detect: Returns ``True`` if this recovery applies to the given tokenizer.
+        compute: Maps ``(tokenizer, input_ids)`` to a ``(seq_len,)`` ``LongTensor``
+            mask aligned to ``input_ids`` (1 on tokens that should contribute to
+            the loss, 0 otherwise).
+    """
+
+    name: str
+    detect: Callable[[object], bool]
+    compute: Callable[[object, torch.Tensor], torch.Tensor]
+
+
+_RECOVERIES: list[LossMaskRecovery] = []
+
+
+def register_loss_mask_recovery(recovery: LossMaskRecovery) -> None:
+    """Register a model-specific loss-mask recovery."""
+    _RECOVERIES.append(recovery)
+
+
+def get_loss_mask_recovery(tokenizer) -> LossMaskRecovery | None:
+    """Return the first registered recovery whose ``detect`` matches ``tokenizer``."""
+    for recovery in _RECOVERIES:
+        if recovery.detect(tokenizer):
+            return recovery
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Kimi
+#
+# Kimi ships only a Python (tiktoken) tokenizer, so it cannot emit assistant masks
+# via apply_chat_template. Its chat turns are rendered as
+#   <|im_{role}|> {role_name} <|im_middle|> {content} <|im_end|>
+# so the assistant content sits between <|im_middle|> and <|im_end|>.
+# ---------------------------------------------------------------------------
+
+_KIMI_ROLE_MARKERS = ("<|im_user|>", "<|im_assistant|>", "<|im_system|>")
+
+
+def _kimi_detect(tokenizer) -> bool:
+    """Whether ``tokenizer`` defines Kimi's chat role and delimiter markers as real tokens."""
+    unk = getattr(tokenizer, "unk_token_id", None)
+    try:
+        ids = [
+            tokenizer.convert_tokens_to_ids(t)
+            for t in (*_KIMI_ROLE_MARKERS, "<|im_middle|>", "<|im_end|>")
+        ]
+    except Exception:
+        return False
+    return all(i is not None and i != unk for i in ids)
+
+
+def _kimi_compute(tokenizer, input_ids) -> torch.Tensor:
+    """Recover the assistant-content mask from already-tokenized Kimi chat ids.
+
+    Marks only the ``{content}`` span (between ``<|im_middle|>`` and ``<|im_end|>``,
+    both exclusive). This matches the ``{% generation %}`` span used for fast
+    tokenizers: the role header and the trailing ``<|im_end|>`` get mask 0 (no loss).
+    """
+    ids = input_ids.tolist() if hasattr(input_ids, "tolist") else list(input_ids)
+    assistant_id = tokenizer.convert_tokens_to_ids("<|im_assistant|>")
+    middle_id = tokenizer.convert_tokens_to_ids("<|im_middle|>")
+    end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    role_ids = {tokenizer.convert_tokens_to_ids(t) for t in _KIMI_ROLE_MARKERS}
+
+    n = len(ids)
+    mask = [0] * n
+    i = 0
+    while i < n:
+        if ids[i] != assistant_id:
+            i += 1
+            continue
+        # Skip the role header (role_name) up to its <|im_middle|> separator.
+        j = i + 1
+        while j < n and ids[j] != middle_id and ids[j] not in role_ids and ids[j] != end_id:
+            j += 1
+        if j >= n or ids[j] != middle_id:
+            # Malformed turn (no content separator) or a trailing generation prompt.
+            i = j
+            continue
+        # Mark the content span [middle + 1, end): excludes <|im_middle|> and <|im_end|>.
+        start = j + 1
+        k = start
+        while k < n and ids[k] != end_id and ids[k] not in role_ids:
+            k += 1
+        for t in range(start, k):
+            mask[t] = 1
+        i = k
+
+    return torch.tensor(mask, dtype=torch.long)
+
+
+register_loss_mask_recovery(
+    LossMaskRecovery(name="kimi", detect=_kimi_detect, compute=_kimi_compute)
+)
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
index 50475b1a28e..8f82b1919b1 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash.yaml
@@ -9,8 +9,10 @@
 # Capture ids = [2,16,31,45,59,60] (kimi_k25/deepseek_v3, 61 layers): 5 DFlash
 # target layers + base 60. n_captured = num_target_layers + 1.
 #
-# answer_only_loss forced false: Kimi's chat template lacks {% generation %} tags
-# needed to derive the assistant-token mask; flip on with a tagged template.
+# answer_only_loss=true: Kimi ships only a slow tokenizer, so it can't derive the
+# assistant mask the standard way (return_assistant_tokens_mask needs a fast
+# tokenizer's char_to_token). The mask is instead recovered from token ids by the
+# registered model-specific recovery in modelopt.torch.utils.loss_mask.
 #
 # Run ON the cluster login node (paramiko can't reach the cluster through its login proxy):
 #   export SLURM_HOST=localhost SLURM_ACCOUNT=<your_account> \
@@ -61,8 +63,8 @@ pipeline:
       - training.disable_tqdm=true
       - training.num_train_epochs=1
       - training.max_steps=3000
-      # Kimi's template lacks {% generation %} tags; train on all tokens (see header).
-      - training.answer_only_loss=false
+      # Assistant mask recovered from token ids for Kimi's slow tokenizer (see header).
+      - training.answer_only_loss=true
       # vLLM container has no tensorboard (dflash.yaml's default report_to); disable.
       - training.report_to=none
       # Kimi-K2.5 has no dedicated mask token; 163838 is a reserved slot used as one.
diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
index fedf729a038..6b70e94d262 100644
--- a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
+++ b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -62,8 +62,9 @@ pipeline:
       - training.ar_validate_steps=500000
       - training.num_train_epochs=1
       - training.max_steps=500
-      # Kimi's template lacks {% generation %} tags, so train on all tokens.
-      - training.answer_only_loss=false
+      # Kimi's slow tokenizer can't emit assistant masks the standard way; the mask
+      # is recovered from token ids (modelopt.torch.utils.loss_mask).
+      - training.answer_only_loss=true
       # vLLM container has no tensorboard (dflash.yaml's default) -> init crash.
       - training.report_to=none
       # Kimi has no dedicated mask token; 163838 is a reserved slot used as the mask.