diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 781bc38c2..f7af83754 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -75,6 +75,8 @@ If you prefer to build and run Docker containers manually: pip install -e . ``` +For scheduler-managed clusters, see the [SLURM guide](slurm.md). + ### 4. Apptainer/Singularity @@ -97,4 +99,5 @@ pip install -e . Once you have Iris running with any of these methods: - Explore the [Examples](../reference/examples.md) directory -- Learn about the [Programming Model](../conceptual/programming-model.md) \ No newline at end of file +- Learn about the [Programming Model](../conceptual/programming-model.md) +- For batch-scheduled environments, see [Running Iris on SLURM](slurm.md) diff --git a/docs/getting-started/slurm.md b/docs/getting-started/slurm.md new file mode 100644 index 000000000..b463da6c1 --- /dev/null +++ b/docs/getting-started/slurm.md @@ -0,0 +1,290 @@ +# Running Iris on SLURM + +This guide covers a practical Iris workflow on SLURM-managed GPU clusters. It is written to stay generic across clusters while matching the provided Iris scripts and working well on clusters where: + +- GPU nodes are scheduled with SLURM +- Docker is available on compute nodes, but not necessarily on login nodes +- fast local storage such as `/scratch` is preferred for builds and test output + +## What the provided SLURM script assumes + +The repository includes `scripts/run_core_tests_slurm.sh`, a batch wrapper for running `scripts/run_core_tests.sh`. + +It **assumes the container image already exists**. It does **not** build `iris-dev` for you. 
+ +By default, the script: + +- requests 1 node with 4 GPUs +- expects a Docker image named `iris-dev` +- stages the repository into node-local storage when available +- installs Iris in editable mode inside the container +- runs `scripts/run_core_tests.sh` +- copies the per-test logs back to `$HOME/slurm-logs/iris-core-tests-<jobid>/` + +If the image is missing, the job fails fast with an explicit error. + +## Fresh-clone workflow + +### 1. Clone the repository on shared storage + +Clone Iris somewhere visible from both the login node and the compute nodes. + +```bash +git clone https://github.com/ROCm/iris.git +cd iris +``` + +If your cluster provides both shared storage and node-local scratch, keep the source tree on shared storage and let jobs copy into scratch for execution. + +### 2. Request an interactive GPU allocation + +If Docker is only available on worker nodes, first allocate a node and enter it. + +```bash +salloc --nodes=1 --gres=gpu:4 --time=02:00:00 +srun --pty $SHELL +``` + +Adjust GPUs, walltime, partition, account, memory, and CPU count to match your site policy. + +### 3. Build the Iris Docker image on the allocated node + +```bash +cd /path/to/iris +./docker/build.sh +``` + +This builds the default image name, `iris-dev`. + +If you want a custom image name: + +```bash +./docker/build.sh my-iris-image +``` + +You can verify that the image exists with: + +```bash +docker image inspect iris-dev +``` + +### 4. Submit the batch job + +From the repository root: + +```bash +sbatch scripts/run_core_tests_slurm.sh +``` + +If you built a custom image: + +```bash +sbatch --export=ALL,IMAGE_NAME=my-iris-image scripts/run_core_tests_slurm.sh +``` + +## Important note about node-local images + +Some clusters store Docker images per node rather than in a shared registry-backed cache. In that setup, building `iris-dev` on one node does not guarantee that another node can see it. + +If your cluster behaves this way, either: + +1. 
build and submit on the same node, or +2. pin the batch job to the node where the image was built, or +3. rebuild the image on the target node + +For example, after building the image on a worker node: + +```bash +NODE_NAME=$(hostname) +sbatch -w "$NODE_NAME" scripts/run_core_tests_slurm.sh +``` + +If your cluster has shared container storage, you can usually omit `-w`. + +## Monitoring the job + +Use normal SLURM tools: + +```bash +squeue -j <jobid> +sacct -j <jobid> +``` + +By default, the batch script writes SLURM stdout/stderr to: + +```bash +iris_core_tests_<jobid>.out +``` + +in the directory where `sbatch` was invoked. + +The per-test logs are copied to: + +```bash +$HOME/slurm-logs/iris-core-tests-<jobid>/ +``` + +## Running interactively inside the container + +For development on an allocated node, you can also start the container manually: + +```bash +./docker/run.sh iris-dev "$(pwd)" +``` + +Then install Iris in editable mode: + +```bash +pip install -e ".[dev]" +``` + +This is useful when you want to debug failures before switching back to `sbatch`. + +## Running example programs under SLURM + +Many examples under `examples/` can be run directly with `python ... --num_ranks <N>` after Iris is installed in the container. + +The repository includes a generic example wrapper: + +```bash +scripts/run_example_slurm.sh +``` + +It stages the repository into node-local storage, installs Iris in the container, runs a chosen example script, and copies any `logs/` or `results/` directories back to: + +```bash +$HOME/slurm-logs/iris-example-<jobid>/ +``` + +### Generic usage + +Submit any repo-relative example script and pass the example arguments after it: + +```bash +sbatch scripts/run_example_slurm.sh <example-script> [example args...] 
+``` + +For example: + +```bash +sbatch scripts/run_example_slurm.sh examples/00_load/load_bench.py --num_ranks 4 +sbatch scripts/run_example_slurm.sh examples/13_flash_decode/example_run.py --num_ranks 4 +``` + +### Example: `examples/14_all_gather_gemm` + +This example directory provides both a pull-model and push-model entrypoint. + +Pull model: + +```bash +sbatch scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_pull.py \ + --num_ranks 4 +``` + +Push model: + +```bash +sbatch scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_push.py \ + --num_ranks 4 +``` + +If your image is node-local, build on a worker node first and optionally pin the submission to that node: + +```bash +NODE_NAME=$(hostname) +sbatch -w "$NODE_NAME" scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_pull.py \ + --num_ranks 4 +``` + +Use a rank count that matches the GPUs allocated to the job. + +### Custom image or install method + +```bash +sbatch --export=ALL,IMAGE_NAME=my-iris-image scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_pull.py \ + --num_ranks 4 +``` + +```bash +sbatch --export=ALL,INSTALL_METHOD=install scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_pull.py \ + --num_ranks 4 +``` + +## Customizing the provided batch wrapper + +The provided script is intentionally conservative and is meant for a 4-GPU core-test workflow. 
+ +Common customizations: + +### Use a different image name + +```bash +sbatch --export=ALL,IMAGE_NAME=my-iris-image scripts/run_core_tests_slurm.sh +``` + +### Store copied logs elsewhere + +```bash +sbatch --export=ALL,PERSIST_LOG_ROOT=$HOME/my-iris-logs scripts/run_core_tests_slurm.sh +``` + +### Use a different scratch location + +If your cluster does not use `/scratch`, point the job at another fast workspace. The wrapper deletes `WORK_ROOT` when the job exits, so use a directory dedicated to the job: + +```bash +sbatch --export=ALL,WORK_ROOT=/path/to/local/workdir scripts/run_core_tests_slurm.sh +``` + +### Change SLURM resources + +Either edit the `#SBATCH` lines in `scripts/run_core_tests_slurm.sh`, or override them at submission time: + +```bash +sbatch --gres=gpu:4 --time=04:00:00 --cpus-per-task=32 scripts/run_core_tests_slurm.sh +``` + +The current wrapper is designed around 4 GPUs. Since `scripts/run_core_tests.sh` includes 1, 2, 4, and 8-rank configurations, the wrapper limits the run to 4 ranks through `IRIS_MAX_NUM_RANKS`, so 8-rank cases are skipped; requesting more GPUs with `--gres` alone does not re-enable them. + +## Troubleshooting + +### `Docker image iris-dev not found` + +Build the image first: + +```bash +./docker/build.sh +``` + +If the image was built on another worker node, submit to that same node or rebuild locally. + +### `docker` is not available on the login node + +Request an interactive allocation and build from inside the worker node: + +```bash +salloc --nodes=1 --gres=gpu:4 --time=02:00:00 +srun --pty $SHELL +./docker/build.sh +``` + +### The job should run from fast local storage + +The provided wrapper already stages the repository into node-local storage when possible. If your cluster uses a different path than `/scratch`, set `WORK_ROOT` when submitting. + +### I need an Apptainer-based workflow instead + +Iris also includes Apptainer support: + +```bash +./apptainer/build.sh +./apptainer/run.sh +``` + +The provided `scripts/run_core_tests_slurm.sh` wrapper is Docker-based, so use the Apptainer scripts directly or create a cluster-specific batch wrapper around them. 
diff --git a/docs/index.md b/docs/index.md index 531520990..721e7b0ba 100644 --- a/docs/index.md +++ b/docs/index.md @@ -195,12 +195,13 @@ if __name__ == "__main__": For more examples, see the [Examples](reference/examples.md) page with ready-to-run scripts and usage patterns. -For other setup methods, see the [Installation Guide](getting-started/installation.md). +For other setup methods, see the [Installation Guide](getting-started/installation.md). For scheduler-managed clusters, see [Running Iris on SLURM](getting-started/slurm.md). ## Documentation Structure ### 📚 **Getting Started** - **[Installation](getting-started/installation.md)**: Set up Iris on your system + - **[SLURM](getting-started/slurm.md)**: Build and run Iris on scheduler-managed GPU clusters - **[Examples](reference/examples.md)**: Working code examples - **[Contributing](CONTRIBUTING.md)**: How to contribute @@ -243,4 +244,4 @@ Want to contribute to Iris? Check out the [Contributing Guide](CONTRIBUTING.md) --- -**Ready to start your multi-GPU journey? Begin with the [Installation Guide](getting-started/installation.md)!** +**Ready to start your multi-GPU journey? 
Begin with the [Installation Guide](getting-started/installation.md) or the [SLURM guide](getting-started/slurm.md)!** diff --git a/scripts/run_core_tests.sh b/scripts/run_core_tests.sh index 03d55a0d2..ee269c354 100755 --- a/scripts/run_core_tests.sh +++ b/scripts/run_core_tests.sh @@ -4,6 +4,43 @@ set -e +if command -v python3 >/dev/null 2>&1; then + PYTHON_BIN="python3" +elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="python" +else + echo "Python interpreter not found" >&2 + exit 1 +fi + +count_visible_gpus() {  # print the number of comma-separated entries in $1 (0 when $1 is empty) + local visible_devices="$1" devices  # devices is local so the parsed array does not leak into global scope + + if [ -z "$visible_devices" ]; then + echo 0 + return + fi + + IFS=',' read -r -a devices <<< "$visible_devices" + echo "${#devices[@]}" +} + +MAX_NUM_RANKS=${IRIS_MAX_NUM_RANKS:-0}  # 0 means "no explicit limit": fall back to the visible-device list, then to torch +if [ "$MAX_NUM_RANKS" -eq 0 ]; then + MAX_NUM_RANKS=$(count_visible_gpus "${ROCR_VISIBLE_DEVICES:-${CUDA_VISIBLE_DEVICES:-}}") +fi +if [ "$MAX_NUM_RANKS" -eq 0 ]; then + MAX_NUM_RANKS=$("$PYTHON_BIN" - <<'PY' +try: + import torch + + print(torch.cuda.device_count()) +except Exception: + print(0) +PY +) +fi + # Get timestamp for this run TIMESTAMP=$(date +%Y%m%d_%H%M%S) @@ -31,6 +68,9 @@ echo "========================================" echo "Timestamp: $TIMESTAMP" echo "Test directories: examples, unittests" echo "Rank configurations: 1, 2, 4, 8" +if [ "$MAX_NUM_RANKS" -gt 0 ]; then + echo "Visible GPU limit: $MAX_NUM_RANKS" +fi echo "Logs: $LOG_DIR/" echo " Main log: $MAIN_LOG" echo " Individual logs: ${LOG_DIR}/__rank*.log" @@ -41,6 +81,17 @@ echo "" # Run each test configuration for config in "${TEST_CONFIGS[@]}"; do IFS=',' read -r test_dir num_ranks <<< "$config" + + if [ "$MAX_NUM_RANKS" -gt 0 ] && [ "$num_ranks" -gt "$MAX_NUM_RANKS" ]; then + { + echo "" + echo "========================================" + echo "Skipping tests: $test_dir with $num_ranks ranks" + echo "Reason: visible GPU limit is $MAX_NUM_RANKS" + echo "========================================" + } | tee -a "$MAIN_LOG" + continue + fi { echo "" @@ -58,16 +109,12 
@@ for config in "${TEST_CONFIGS[@]}"; do echo " Ranks: $num_ranks" echo " Logs: ${log_prefix}_rank*.log" - # Run the test and capture output per rank - # The run_tests_distributed.py spawns processes, so we need to modify it - # or use a wrapper. For now, let's run it and tee the output. - if [ "$num_ranks" -eq 1 ]; then # Single rank - direct log - python tests/run_tests_distributed.py --num_ranks $num_ranks "$test_file" -v --tb=short 2>&1 | tee "${log_prefix}_rank0.log" + "$PYTHON_BIN" tests/run_tests_distributed.py --num_ranks "$num_ranks" "$test_file" -v --tb=short 2>&1 | tee "${log_prefix}_rank0.log" else # Multi-rank - combined log - python tests/run_tests_distributed.py --num_ranks $num_ranks "$test_file" -v --tb=short 2>&1 | tee "${log_prefix}_all_ranks.log" + "$PYTHON_BIN" tests/run_tests_distributed.py --num_ranks "$num_ranks" "$test_file" -v --tb=short 2>&1 | tee "${log_prefix}_all_ranks.log" fi # Check exit code diff --git a/scripts/run_core_tests_slurm.sh b/scripts/run_core_tests_slurm.sh new file mode 100755 index 000000000..55814d3ed --- /dev/null +++ b/scripts/run_core_tests_slurm.sh @@ -0,0 +1,108 @@ +#!/bin/bash +#SBATCH --job-name=iris-core-tests +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=64G +#SBATCH --gres=gpu:4 +#SBATCH --time=06:00:00 +#SBATCH --output=iris_core_tests_%j.out + +set -euo pipefail + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +REPO_SRC=${REPO_SRC:-${SLURM_SUBMIT_DIR:-$(realpath "$SCRIPT_DIR/..")}} +IMAGE_NAME=${IMAGE_NAME:-iris-dev} +if [ -d "/scratch" ]; then + DEFAULT_WORK_PARENT="/scratch/$USER" +else + DEFAULT_WORK_PARENT="/tmp/$USER" +fi +WORK_ROOT=${WORK_ROOT:-$DEFAULT_WORK_PARENT/iris-core-tests-$SLURM_JOB_ID} +WORKSPACE_DIR="$WORK_ROOT/iris" +PERSIST_LOG_ROOT=${PERSIST_LOG_ROOT:-$HOME/slurm-logs/iris-core-tests-$SLURM_JOB_ID} +CONTAINER_NAME="${USER}-iris-core-tests-${SLURM_JOB_ID}" +CONTAINER_LABEL="user=${USER}" + +copy_logs_and_cleanup() { + local exit_code=$1 + + if [ -d 
"$WORKSPACE_DIR/logs" ]; then + mkdir -p "$PERSIST_LOG_ROOT" + if ! rsync -a "$WORKSPACE_DIR/logs/" "$PERSIST_LOG_ROOT/"; then + echo "Failed to copy logs to $PERSIST_LOG_ROOT" >&2 + if [ "$exit_code" -eq 0 ]; then + exit_code=1 + fi + fi + fi + + rm -rf "$WORK_ROOT" || echo "Failed to remove $WORK_ROOT" >&2  # best-effort: under set -e a failed cleanup must not replace the job's exit code + exit "$exit_code" +} + +trap 'copy_logs_and_cleanup $?' EXIT + +mkdir -p "$WORK_ROOT" +rsync -a --delete \ + --exclude=".git/" \ + --exclude=".cache/" \ + --exclude=".pytest_cache/" \ + --exclude=".venv/" \ + --exclude="iris.egg-info/" \ + --exclude="logs/" \ + --exclude="results/" \ + "$REPO_SRC/" "$WORKSPACE_DIR/" + +cd "$WORKSPACE_DIR" + +echo "Repository source: $REPO_SRC" +echo "Scratch workspace: $WORKSPACE_DIR" +echo "Running on node: $(hostname)" +echo "Image name: $IMAGE_NAME" +echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" +echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES:-}" + +if ! command -v docker >/dev/null 2>&1; then + echo "docker is not available on $(hostname)" >&2 + exit 1 +fi + +if ! docker image inspect "$IMAGE_NAME" >/dev/null 2>&1; then + echo "Docker image $IMAGE_NAME not found on $(hostname)" >&2 + exit 1 +fi + +GPU_ENV_ARGS=()  # may stay empty; expanded below via ${arr[@]+...} so bash < 4.4 does not abort under set -u +if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then + GPU_ENV_ARGS+=(-e "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}") +fi +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + GPU_ENV_ARGS+=(-e "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}") + GPU_ENV_ARGS+=(-e "HIP_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}") +fi +# IRIS_MAX_NUM_RANKS defaults to 4 (the #SBATCH GPU count); override it with sbatch --export when requesting more GPUs. +docker run --rm \ + --name "$CONTAINER_NAME" \ + --label "$CONTAINER_LABEL" \ + --network=host \ + --ipc=host \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size=16G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + ${GPU_ENV_ARGS[@]+"${GPU_ENV_ARGS[@]}"} \ + -e HOME="$WORKSPACE_DIR" \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e IRIS_MAX_NUM_RANKS="${IRIS_MAX_NUM_RANKS:-4}" \ + -v "$WORKSPACE_DIR:$WORKSPACE_DIR" \ + -w "$WORKSPACE_DIR" \ + --entrypoint bash \ + "$IMAGE_NAME" \ + -lc 'set -euo 
pipefail; git config --global --add safe.directory "$PWD"; python3 -m pip install -e ".[dev]"; bash scripts/run_core_tests.sh' + +echo "Logs will be copied to $PERSIST_LOG_ROOT" diff --git a/scripts/run_example_slurm.sh b/scripts/run_example_slurm.sh new file mode 100644 index 000000000..6e461f4b0 --- /dev/null +++ b/scripts/run_example_slurm.sh @@ -0,0 +1,177 @@ +#!/bin/bash +#SBATCH --job-name=iris-example +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=64G +#SBATCH --gres=gpu:4 +#SBATCH --time=02:00:00 +#SBATCH --output=iris_example_%j.out + +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: + sbatch scripts/run_example_slurm.sh <example-script> [example args...] + +Examples: + sbatch scripts/run_example_slurm.sh examples/14_all_gather_gemm/example_run_pull.py --num_ranks 4 + sbatch --export=ALL,IMAGE_NAME=my-iris-image scripts/run_example_slurm.sh \ + examples/14_all_gather_gemm/example_run_push.py --num_ranks 4 --dtype bfloat16 + +Environment overrides: + IMAGE_NAME Docker image name (default: iris-dev) + INSTALL_METHOD editable | install | git (default: editable) + WORK_ROOT Node-local working directory for the staged repo (deleted on exit) + PERSIST_LOG_ROOT Directory where logs/results are copied after the job +EOF +} + +if [ $# -lt 1 ]; then + usage >&2 + exit 1 +fi + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +REPO_SRC=${REPO_SRC:-${SLURM_SUBMIT_DIR:-$(realpath "$SCRIPT_DIR/..")}} +IMAGE_NAME=${IMAGE_NAME:-iris-dev} +INSTALL_METHOD=${INSTALL_METHOD:-editable} +if [ -d "/scratch" ]; then + DEFAULT_WORK_PARENT="/scratch/$USER" +else + DEFAULT_WORK_PARENT="/tmp/$USER" +fi +WORK_ROOT=${WORK_ROOT:-$DEFAULT_WORK_PARENT/iris-example-$SLURM_JOB_ID} +WORKSPACE_DIR="$WORK_ROOT/iris" +PERSIST_LOG_ROOT=${PERSIST_LOG_ROOT:-$HOME/slurm-logs/iris-example-$SLURM_JOB_ID} +CONTAINER_NAME="${USER}-iris-example-${SLURM_JOB_ID}" +CONTAINER_LABEL="user=${USER}" + +EXAMPLE_SCRIPT_INPUT=$1 +shift +EXAMPLE_ARGS=("$@") + +if [[ "$EXAMPLE_SCRIPT_INPUT" = /* ]]; then + case 
"$EXAMPLE_SCRIPT_INPUT" in + "$REPO_SRC"/*) + EXAMPLE_SCRIPT=${EXAMPLE_SCRIPT_INPUT#"$REPO_SRC"/} + ;; + *) + echo "Example script must be inside the repository: $EXAMPLE_SCRIPT_INPUT" >&2 + exit 1 + ;; + esac +else + EXAMPLE_SCRIPT=$EXAMPLE_SCRIPT_INPUT +fi + +if [ ! -f "$REPO_SRC/$EXAMPLE_SCRIPT" ]; then + echo "Example script not found: $EXAMPLE_SCRIPT" >&2 + exit 1 +fi + +case "$INSTALL_METHOD" in + editable) + INSTALL_CMD='python3 -m pip install -e ".[dev]"' + ;; + install) + INSTALL_CMD='python3 -m pip install .' + ;; + git) + REPO=${GITHUB_REPOSITORY:-ROCm/iris} + SHA=${GITHUB_SHA:-HEAD} + INSTALL_CMD="python3 -m pip install git+https://github.com/${REPO}.git@${SHA}" + ;; + *) + echo "Unsupported INSTALL_METHOD: $INSTALL_METHOD" >&2 + exit 1 + ;; +esac + +printf -v EXAMPLE_SCRIPT_ESCAPED '%q' "$EXAMPLE_SCRIPT" +EXAMPLE_ARGS_ESCAPED=""; [ "${#EXAMPLE_ARGS[@]}" -eq 0 ] || printf -v EXAMPLE_ARGS_ESCAPED '%q ' "${EXAMPLE_ARGS[@]}"  # with zero args printf would still emit '' and pass an empty argument to the example + +copy_artifacts_and_cleanup() { + local exit_code=$1 + + mkdir -p "$PERSIST_LOG_ROOT" + if [ -d "$WORKSPACE_DIR/logs" ]; then + rsync -a "$WORKSPACE_DIR/logs/" "$PERSIST_LOG_ROOT/logs/" || exit_code=$? + fi + if [ -d "$WORKSPACE_DIR/results" ]; then + rsync -a "$WORKSPACE_DIR/results/" "$PERSIST_LOG_ROOT/results/" || exit_code=$? + fi + + rm -rf "$WORK_ROOT" || echo "Failed to remove $WORK_ROOT" >&2  # best-effort: under set -e a failed cleanup must not replace the job's exit code + exit "$exit_code" +} + +trap 'copy_artifacts_and_cleanup $?' EXIT + +mkdir -p "$WORK_ROOT" +rsync -a --delete \ + --exclude=".git/" \ + --exclude=".cache/" \ + --exclude=".pytest_cache/" \ + --exclude=".venv/" \ + --exclude=".triton/" \ + --exclude="iris.egg-info/" \ + --exclude="logs/" \ + --exclude="results/" \ + "$REPO_SRC/" "$WORKSPACE_DIR/" + +cd "$WORKSPACE_DIR" + +echo "Repository source: $REPO_SRC" +echo "Scratch workspace: $WORKSPACE_DIR" +echo "Running on node: $(hostname)" +echo "Image name: $IMAGE_NAME" +echo "Example script: $EXAMPLE_SCRIPT" +echo "Example args: ${EXAMPLE_ARGS[*]:-}" +echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" +echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES:-}" + +if ! 
command -v docker >/dev/null 2>&1; then + echo "docker is not available on $(hostname)" >&2 + exit 1 +fi + +if ! docker image inspect "$IMAGE_NAME" >/dev/null 2>&1; then + echo "Docker image $IMAGE_NAME not found on $(hostname)" >&2 + exit 1 +fi + +GPU_ENV_ARGS=()  # may stay empty; expanded below via ${arr[@]+...} so bash < 4.4 does not abort under set -u +if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then + GPU_ENV_ARGS+=(-e "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}") +fi +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + GPU_ENV_ARGS+=(-e "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}") + GPU_ENV_ARGS+=(-e "HIP_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}") +fi + +docker run --rm \ + --user "$(id -u):$(id -g)" \ + --name "$CONTAINER_NAME" \ + --label "$CONTAINER_LABEL" \ + --network=host \ + --ipc=host \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size=16G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + ${GPU_ENV_ARGS[@]+"${GPU_ENV_ARGS[@]}"} \ + -e HOME="$WORKSPACE_DIR" \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -v "$WORKSPACE_DIR:$WORKSPACE_DIR" \ + -w "$WORKSPACE_DIR" \ + --entrypoint bash \ + "$IMAGE_NAME" \ + -lc "set -euo pipefail; git config --global --add safe.directory \"\$PWD\"; $INSTALL_CMD; python3 $EXAMPLE_SCRIPT_ESCAPED $EXAMPLE_ARGS_ESCAPED" + +echo "Artifacts will be copied to $PERSIST_LOG_ROOT" diff --git a/tests/run_tests_distributed.py b/tests/run_tests_distributed.py old mode 100755 new mode 100644 index fc28b5da0..9c6970a34 --- a/tests/run_tests_distributed.py +++ b/tests/run_tests_distributed.py @@ -3,43 +3,91 @@ # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. """ -Worker script for running pytest tests under torchrun. -This script is invoked by torchrun and runs pytest within a distributed process group. +Launcher/worker script for running pytest tests under torchrun. 
+ +Direct usage: + python tests/run_tests_distributed.py tests/unittests/ --num_ranks 4 -v + +Worker usage (invoked automatically by torchrun): + torchrun --nproc_per_node=4 tests/run_tests_distributed.py tests/unittests/ -v """ +from __future__ import annotations + +import argparse import os +import subprocess import sys +from pathlib import Path + + +def _running_under_torchrun() -> bool:  # worker mode is inferred solely from RANK and WORLD_SIZE both being set + return "RANK" in os.environ and "WORLD_SIZE" in os.environ + + +def _parse_launcher_args(argv: list[str]) -> tuple[int, list[str]]:  # returns (num_ranks, arguments to forward to pytest) + parser = argparse.ArgumentParser(description="Run pytest tests under torchrun.") + parser.add_argument("--num_ranks", type=int, required=True, help="Number of torchrun processes to launch.") + args, pytest_args = parser.parse_known_args(argv)  # options this parser does not recognize are kept for pytest + + if args.num_ranks < 1: + parser.error("--num_ranks must be at least 1") + if not pytest_args: + parser.error("At least one pytest path or argument is required") -# Set required environment variable for RCCL on ROCm -os.environ.setdefault("HSA_NO_SCRATCH_RECLAIM", "1") + return args.num_ranks, pytest_args -import torch -import torch.distributed as dist -# torchrun sets these environment variables automatically -rank = int(os.environ.get("RANK", 0)) -world_size = int(os.environ.get("WORLD_SIZE", 1)) -local_rank = int(os.environ.get("LOCAL_RANK", 0)) +def _launch_torchrun(argv: list[str]) -> int:  # re-runs this file under torch.distributed.run and returns the child's exit status + num_ranks, pytest_args = _parse_launcher_args(argv) + script_path = str(Path(__file__).resolve()) + launch_cmd = [ + sys.executable, + "-m", + "torch.distributed.run", + "--rdzv-backend=c10d", + "--rdzv-endpoint=localhost:0",  # NOTE(review): port 0 presumably lets the rendezvous pick a free port; confirm against the torchrun docs + "--nnodes=1", + f"--nproc_per_node={num_ranks}", + script_path,  # the same file is handed to torchrun; --num_ranks has already been stripped from the forwarded args + *pytest_args, + ] + return subprocess.run(launch_cmd, check=False).returncode  # check=False: a failing run is reported via the return code, not an exception -# Set the correct GPU for this specific process -if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) -# Initialize distributed - torchrun already set up the environment -dist.init_process_group( - backend="nccl", - rank=rank, - 
world_size=world_size, - device_id=torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else None, -) +def _run_pytest_worker(pytest_args: list[str]) -> int:  # runs pytest inside an initialized process group and returns pytest's exit code + os.environ.setdefault("HSA_NO_SCRATCH_RECLAIM", "1")  # set before torch is imported; an existing value is left untouched -try: - # Import and run pytest with command-line arguments import pytest + import torch + import torch.distributed as dist + + rank = int(os.environ.get("RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + + if torch.cuda.is_available(): + torch.cuda.set_device(local_rank)  # bind this process to the GPU whose index equals its LOCAL_RANK + + dist.init_process_group( + backend="nccl", + rank=rank, + world_size=world_size, + device_id=torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else None, + ) + + try: + return pytest.main(pytest_args) + finally:  # tear the process group down even when pytest raises + if dist.is_initialized(): + dist.destroy_process_group() + + +def main() -> int:  # dispatch: worker when RANK/WORLD_SIZE are set, launcher otherwise + if _running_under_torchrun(): + return _run_pytest_worker(sys.argv[1:]) + return _launch_torchrun(sys.argv[1:]) + - # Pass through all command-line arguments to pytest - exit_code = pytest.main(sys.argv[1:]) - sys.exit(exit_code) -finally: - if dist.is_initialized(): - dist.destroy_process_group() +if __name__ == "__main__": + raise SystemExit(main())