Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __pycache__/
*.egg-info/

# Virtual environments
.venv/
.test_venv/
.cvs_venv/
.ruff_venv/
Expand Down
36 changes: 36 additions & 0 deletions cvs/input/config_file/aorta/aorta_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,42 @@ analysis:
gemm_script: scripts/gemm_analysis/run_tracelens_analysis.sh
skip_if_exists: false

# Multi-node disaggregated launch.
#
# When the cluster file has more than one node entry, the runner launches a
# `torchrun` rank-group on every node in parallel (one per node), all
# rendezvous-ing on the head node. This mirrors aorta's own
# scripts/multi_node/local_launch.sh pattern, so a single cluster.json with N
# host entries is enough -- you no longer need N cluster files.
#
# `master_launch_mode: auto` keeps current single-node behavior (delegates to
# experiment_script) for 1-node clusters and switches to disaggregated
# torchrun for >1-node clusters. Force one or the other with `script` /
# `torchrun` if you want to override the auto-detection.
#
# When master_launch_mode resolves to torchrun, `experiment_script` is NOT
# used -- the runner builds:
# torchrun --nnodes <N> --node_rank <R> --nproc_per_node <P>
# --master_addr <head> --master_port <PORT>
# <container_mount_path>/<train_script>
# --config <container_mount_path>/<base_config>
# [--override training_overrides...]
#
# extra_env is exported inside each container before torchrun -- use it for
# transport-specific knobs that depend on the cluster (NCCL_SOCKET_IFNAME,
# NCCL_IB_HCA, NCCL_IB_GID_INDEX, ...). On a single ethernet network you can
# usually leave it empty.
multi_node:
master_launch_mode: auto
# nproc_per_node: 8 # defaults to gpus_per_node
# master_port: 29500 # default: pick a free ephemeral port
# master_addr: 10.0.0.1 # default: head node from cluster.json
train_script: train.py
extra_torchrun_args: []
extra_train_args: []
extra_env: {}
collect_traces: true

# Expected results: default thresholds for gfx942 (e.g. MI300). Change these as per your
# testing config (GPU, node count, workload); see docs/reference/configuration-files/aorta.rst.
# Tuned for host raw-trace parsing; use stricter values (e.g. min_compute_ratio 0.5+) with TraceLens Excel reports.
Expand Down
105 changes: 105 additions & 0 deletions cvs/parsers/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,96 @@ class AortaAnalysisConfigFile(BaseModel):
)


class AortaMultiNodeConfigFile(BaseModel):
    """
    Schema for the optional ``multi_node`` section in aorta_benchmark.yaml.

    When the cluster file contains more than one node, the runner launches a
    disaggregated ``torchrun`` invocation on every node (one per ``node_dict``
    entry), rendezvous-ing on the head node. This block tunes that path.

    Single-node clusters ignore this block; ``master_launch_mode`` defaults to
    ``auto`` which means: ``script`` (current behavior, single-node) when the
    cluster has one node, ``torchrun`` (multi-node disaggregated) when it has
    more than one.

    Unknown keys are rejected (``extra="forbid"``) so typos in the YAML
    section surface as validation errors instead of being silently dropped.
    """

    model_config = ConfigDict(extra="forbid")

    master_launch_mode: str = Field(
        default="auto",
        description=(
            "How the experiment is launched on each node. 'auto' picks 'script' "
            "for single-node clusters and 'torchrun' for multi-node clusters. "
            "'script' always uses the configured experiment_script (single-node only). "
            "'torchrun' always builds a multi-node torchrun command and ignores "
            "experiment_script."
        ),
    )
    nproc_per_node: Optional[int] = Field(
        default=None,
        ge=1,
        description=(
            "Processes (GPUs) per node passed to torchrun --nproc_per_node. "
            "Defaults to the top-level gpus_per_node when unset."
        ),
    )
    master_port: Optional[int] = Field(
        default=None,
        ge=1024,
        le=65535,
        description=(
            "Port used for torchrun rendezvous (--master_port). When unset the "
            "runner picks a free ephemeral port on the head node."
        ),
    )
    master_addr: Optional[str] = Field(
        default=None,
        description=(
            "Override the master address (--master_addr). Defaults to the head node hostname/IP from the cluster file."
        ),
    )
    train_script: str = Field(
        default="train.py",
        description=(
            "Path to the Aorta training entry script relative to aorta_path. "
            "Used when master_launch_mode resolves to 'torchrun'."
        ),
    )
    extra_torchrun_args: List[str] = Field(
        default_factory=list,
        description="Additional CLI flags appended to the torchrun command.",
    )
    extra_train_args: List[str] = Field(
        default_factory=list,
        description="Additional CLI flags appended to train.py after --config.",
    )
    extra_env: Dict[str, str] = Field(
        default_factory=dict,
        description=(
            "Extra environment variables to export inside the container before "
            "torchrun. Useful for NCCL_SOCKET_IFNAME, NCCL_IB_HCA, "
            "NCCL_IB_GID_INDEX, and similar transport-tuning knobs."
        ),
    )
    collect_traces: bool = Field(
        default=True,
        description=(
            "When true, copy each node's torch_profiler artifacts back to the "
            "head node under <aorta_path>/combined_traces/node_<rank>/ so the "
            "host parsers see one unified trace tree."
        ),
    )

    @field_validator('master_launch_mode')
    @classmethod
    def validate_launch_mode(cls, v: str) -> str:
        """Reject any launch mode other than 'auto', 'script' or 'torchrun'.

        Raises:
            ValueError: if ``v`` is not one of the allowed modes.
        """
        allowed = {"auto", "script", "torchrun"}
        if v not in allowed:
            raise ValueError(f"master_launch_mode must be one of {sorted(allowed)}, got {v!r}")
        return v

    @field_validator('extra_env', mode='before')
    @classmethod
    def coerce_extra_env_numbers(cls, v: object) -> object:
        """Stringify numeric ``extra_env`` values before type validation.

        YAML parses unquoted scalars such as ``NCCL_IB_GID_INDEX: 3`` as
        numbers, and pydantic v2 does not convert numbers to ``str`` by
        default, so without this step a natural-looking config is rejected.
        Environment variables are strings anyway, so the conversion is
        lossless for the consumer.

        Non-dict input and non-numeric values are returned untouched so the
        regular ``Dict[str, str]`` validation still reports them.
        """
        if not isinstance(v, dict):
            return v
        coerced = {}
        for key, value in v.items():
            # bool is an int subclass; leave it for normal validation rather
            # than silently exporting "True"/"False".
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                coerced[key] = str(value)
            else:
                coerced[key] = value
        return coerced


class AortaBenchmarkConfigFile(BaseModel):
"""
Schema for the entire aorta_benchmark.yaml configuration file.
Expand Down Expand Up @@ -444,6 +534,16 @@ class AortaBenchmarkConfigFile(BaseModel):
default_factory=AortaAnalysisConfigFile, description="Post-benchmark analysis configuration"
)

# Multi-node disaggregated launch (one container + torchrun rank per node)
multi_node: AortaMultiNodeConfigFile = Field(
default_factory=AortaMultiNodeConfigFile,
description=(
"Multi-node launch configuration. Used when the cluster file lists "
"more than one node; ignored for single-node clusters unless "
"master_launch_mode is forced to 'torchrun'."
),
)

@field_validator('aorta_path')
@classmethod
def validate_aorta_path_not_placeholder(cls, v: str) -> str:
Expand Down Expand Up @@ -483,6 +583,11 @@ def validate_paths_exist(self) -> List[str]:
if not exp_script.exists():
errors.append(f"experiment_script does not exist: {exp_script}")

if self.multi_node.master_launch_mode == "torchrun":
train_script_path = aorta / self.multi_node.train_script
if not train_script_path.exists():
errors.append(f"multi_node.train_script does not exist: {train_script_path}")

# Check analysis scripts if enabled
if self.analysis.enable_tracelens:
tracelens_script = aorta / self.analysis.tracelens_script
Expand Down
Loading