Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
beafa1c
Initial plan
Copilot Oct 19, 2025
3fa7d2b
Fix producer-consumer locking with atomic_add and proper synchronization
Copilot Oct 19, 2025
fd45d66
Fix synchronization to wait for world_size instead of world_size-1
Copilot Oct 19, 2025
cf9e546
Simplify consumer wait loop to avoid resetting tile_ready
Copilot Oct 19, 2025
8d7c7e1
Add memory fences to ensure proper visibility of local_C writes
Copilot Oct 19, 2025
5d1233f
Initial iris-ccl implementation.
neoblizz Nov 1, 2025
ad58f05
Merge branch 'main' into muhosama/ccl-init
neoblizz Nov 1, 2025
c3ee2a1
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
798aec3
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
9f6bbe5
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
422e313
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
30207a0
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
06dc446
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
2a18785
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
a245047
Apply suggestion from @Copilot
neoblizz Nov 3, 2025
efef873
All-to-all benchmark.py.
neoblizz Nov 3, 2025
c5dae7f
Make ccl a class of operators.
neoblizz Nov 4, 2025
3bc2441
Make-shift traffic shaping algorithm - works well with iris translate…
neoblizz Nov 5, 2025
9bfbde9
Better defaults + Gluon-based Iris All-to-all.
neoblizz Nov 6, 2025
ab6aa81
iris.ccl.all_reduce()
neoblizz Nov 7, 2025
1c92a40
...
neoblizz Nov 9, 2025
0a5aed4
Finalize AR algorithms variants.
neoblizz Nov 9, 2025
cecfafd
Apply suggestions from code review
neoblizz Nov 9, 2025
dcdd20a
Recast from_base/to_base using gl ptr.
neoblizz Nov 9, 2025
eb4b259
Enable ccl tests.
neoblizz Nov 10, 2025
2b2b11e
Fix all-reduce bw calc.
neoblizz Nov 10, 2025
3cebc56
Testing infra: cleanup after test.
neoblizz Nov 11, 2025
7461815
Fix tests.
neoblizz Nov 11, 2025
a4eb3d7
Increase testing timeout.
neoblizz Nov 11, 2025
07c4a98
Update iris-external-validation-test.yml [no ci]
neoblizz Nov 11, 2025
9626d78
Remove timeout [no ci]
neoblizz Nov 11, 2025
5626d0d
Update iris-tests-apptainer.yml
neoblizz Nov 11, 2025
c76d5e2
Lint.
neoblizz Nov 11, 2025
04e7fe4
More lint.
neoblizz Nov 11, 2025
e7cb2c3
Apply Ruff auto-fixes
github-actions[bot] Nov 11, 2025
9c58a8f
...
neoblizz Nov 12, 2025
866d6b9
More testing.
neoblizz Nov 12, 2025
124cc30
More fixes.
neoblizz Nov 12, 2025
376aa5f
More updates... please make it stop.
neoblizz Nov 12, 2025
2118c72
....
neoblizz Nov 12, 2025
ba75aef
Apply Ruff auto-fixes
github-actions[bot] Nov 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 21 additions & 13 deletions .github/scripts/container_exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Universal container exec script - thin wrapper that executes commands in either Apptainer or Docker
# Usage: container_exec.sh [--gpus GPUS] [--image IMAGE] <command>

set -e

# Parse optional arguments
GPU_DEVICES=""
Expand All @@ -30,8 +29,8 @@ done
# Remaining args are the command
COMMAND="$@"
if [ -z "$COMMAND" ]; then
echo "[ERROR] No command provided"
echo "Usage: $0 [--gpus GPUS] [--image IMAGE] <command>"
echo "[ERROR] No command provided" >&2
echo "Usage: $0 [--gpus GPUS] [--image IMAGE] <command>" >&2
exit 1
fi

Expand All @@ -43,7 +42,7 @@ elif command -v docker &> /dev/null; then
CONTAINER_RUNTIME="docker"
echo "[INFO] Using Docker"
else
echo "[ERROR] Neither Apptainer nor Docker is available"
echo "[ERROR] Neither Apptainer nor Docker is available" >&2
exit 1
fi

Expand All @@ -57,13 +56,16 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
elif [ -f apptainer/images/iris.sif ]; then
IMAGE="apptainer/images/iris.sif"
else
echo "[ERROR] Apptainer image not found"
echo "[ERROR] Apptainer image not found" >&2
exit 1
fi

# Create temporary overlay in workspace
OVERLAY="./iris_overlay_$(date +%s%N).img"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" > /dev/null 2>&1
# Create a temporary overlay image in the workspace. The file name combines
# this shell's PID ($$) with the output of `date +%s%N` to keep it unique.
OVERLAY="./iris_overlay_$$_$(date +%s%N).img"
# Output of the create command is discarded; only its exit status is checked.
if ! apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" > /dev/null 2>&1; then
    # Diagnostics go to stderr, like the script's other [ERROR] messages.
    echo "[ERROR] Failed to create Apptainer overlay" >&2
    exit 1
fi

# Build exec command
EXEC_CMD="apptainer exec --overlay ${OVERLAY} --no-home --cleanenv"
Expand All @@ -76,14 +78,18 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
# Add standard flags
EXEC_CMD="$EXEC_CMD --bind ${PWD}:/iris_workspace --cwd /iris_workspace"

# Execute
$EXEC_CMD "$IMAGE" bash -c "$COMMAND"
# Run the command inside the container. Its exit status is recorded rather
# than acted on immediately, so the overlay file is removed on both the
# success and the failure path before this script exits.
# $EXEC_CMD is expanded unquoted on purpose: it holds the command plus flags.
EXIT_CODE=0
if $EXEC_CMD "$IMAGE" bash -c "$COMMAND"; then
    :
else
    EXIT_CODE=$?
fi
# Best-effort removal of the overlay; any error from rm is ignored.
rm -f "${OVERLAY}" 2>/dev/null || true
exit $EXIT_CODE

elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev-triton-aafec41"}}

if ! docker image inspect "$IMAGE_NAME" &> /dev/null; then
echo "[ERROR] Docker image $IMAGE_NAME not found"
echo "[ERROR] Docker image $IMAGE_NAME not found" >&2
exit 1
fi

Expand Down Expand Up @@ -114,7 +120,9 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
RUN_CMD="$RUN_CMD -e HIP_VISIBLE_DEVICES=${GPU_DEVICES}"
fi

# Execute
$RUN_CMD "$IMAGE_NAME" -c "$COMMAND"
# Run the command in the Docker container and exit with the status it
# reported. $RUN_CMD is expanded unquoted on purpose: it holds the command
# plus flags.
EXIT_CODE=0
if $RUN_CMD "$IMAGE_NAME" -c "$COMMAND"; then
    :
else
    EXIT_CODE=$?
fi
exit $EXIT_CODE
fi

6 changes: 6 additions & 0 deletions .github/scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,11 @@ fi
echo \"Testing: \$test_file with $NUM_RANKS ranks\"
python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
done

# Run ccl tests
for test_file in tests/ccl/test_*.py; do
echo \"Testing: \$test_file with $NUM_RANKS ranks\"
python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
done
"

5 changes: 1 addition & 4 deletions .github/workflows/iris-external-validation-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ concurrency:
jobs:
build-container-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90

steps:
- name: Checkout repository
Expand All @@ -40,7 +39,6 @@ jobs:
name: External Validation Test
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30

steps:
- name: Checkout repository
Expand Down Expand Up @@ -69,7 +67,6 @@ jobs:
name: External Gluon Validation Test
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30

steps:
- name: Checkout repository
Expand All @@ -92,4 +89,4 @@ jobs:
"
echo "::endgroup::"

echo "✅ External gluon validation test passed!"
echo "✅ External gluon validation test passed!"
67 changes: 49 additions & 18 deletions .github/workflows/iris-pip-install-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ concurrency:
jobs:
build-container-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90

steps:
- name: Checkout repository
Expand All @@ -40,7 +39,6 @@ jobs:
name: Pip Install Test 1/2/4 Ranks (Parallel)
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30

steps:
- name: Checkout repository
Expand All @@ -54,8 +52,7 @@ jobs:

- name: Run pip install tests for 1, 2, 4 ranks in parallel
run: |
set -e

# Don't use set -e here - we want to handle errors manually for parallel processes
# Run tests in parallel with different GPU assignments
# Note: Each test gets 2+ GPUs even if it only uses some of them.
# This allows tests like test_empty_device_handling to verify that
Expand All @@ -75,7 +72,7 @@ jobs:
echo \"Testing: \$test_file with 1 ranks\"
python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10
done
" &
" > /tmp/test_1rank.log 2>&1 &
PID1=$!

echo "Starting 2-rank test on GPUs 2,3..."
Expand All @@ -91,7 +88,7 @@ jobs:
echo \"Testing: \$test_file with 2 ranks\"
python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10
done
" &
" > /tmp/test_2rank.log 2>&1 &
PID2=$!

echo "Starting 4-rank test on GPUs 4,5,6,7..."
Expand All @@ -107,7 +104,7 @@ jobs:
echo \"Testing: \$test_file with 4 ranks\"
python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10
done
" &
" > /tmp/test_4rank.log 2>&1 &
PID4=$!
echo "::endgroup::"

Expand All @@ -116,11 +113,44 @@ jobs:
FAIL=0
FAILED_TESTS=""

wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; }
wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; }
wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; }
# Wait for each process and capture exit status
if ! wait $PID1; then
echo "::error::1-rank test FAILED"
echo "::group::1-rank test logs"
cat /tmp/test_1rank.log || true
echo "::endgroup::"
FAILED_TESTS="$FAILED_TESTS 1-rank"
FAIL=1
else
echo "✅ 1-rank test passed"
fi

if ! wait $PID2; then
echo "::error::2-rank test FAILED"
echo "::group::2-rank test logs"
cat /tmp/test_2rank.log || true
echo "::endgroup::"
FAILED_TESTS="$FAILED_TESTS 2-rank"
FAIL=1
else
echo "✅ 2-rank test passed"
fi

if ! wait $PID4; then
echo "::error::4-rank test FAILED"
echo "::group::4-rank test logs"
cat /tmp/test_4rank.log || true
echo "::endgroup::"
FAILED_TESTS="$FAILED_TESTS 4-rank"
FAIL=1
else
echo "✅ 4-rank test passed"
fi
echo "::endgroup::"

# Clean up log files
rm -f /tmp/test_1rank.log /tmp/test_2rank.log /tmp/test_4rank.log

if [ $FAIL -eq 1 ]; then
echo "::error::Parallel tests failed:$FAILED_TESTS"
exit 1
Expand All @@ -132,7 +162,6 @@ jobs:
name: Pip Install Test 8 Ranks
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30

steps:
- name: Checkout repository
Expand All @@ -146,10 +175,8 @@ jobs:

- name: Run 8-rank pip install test
run: |
set -e

echo "::group::Running 8-rank test on all GPUs"
bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" "
if bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" "
set -e
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
pip install -e .
Expand All @@ -161,7 +188,11 @@ jobs:
echo \"Testing: \$test_file with 8 ranks\"
python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10
done
"
echo "::endgroup::"

echo "✅ 8-rank test passed!"
"; then
echo "::endgroup::"
echo "✅ 8-rank test passed!"
else
echo "::endgroup::"
echo "::error::8-rank test FAILED"
exit 1
fi
3 changes: 0 additions & 3 deletions .github/workflows/iris-tests-apptainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ concurrency:
jobs:
build-container-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90

steps:
- name: Checkout repository
Expand All @@ -39,7 +38,6 @@ jobs:
name: Test 1/2/4 Ranks (Parallel)
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 20

steps:
- name: Checkout repository
Expand Down Expand Up @@ -93,7 +91,6 @@ jobs:
name: Test 8 Ranks
needs: build-container-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 15

steps:
- name: Checkout repository
Expand Down
Loading