ROCm · neoblizz · Nov 12, 2025 · Oct 19, 2025 · Oct 19, 2025 · Oct 19, 2025
@@ -5,7 +5,6 @@
 # Universal container exec script - thin wrapper that executes commands in either Apptainer or Docker
 # Usage: container_exec.sh [--gpus GPUS] [--image IMAGE] <command>
 
-set -e
 
 # Parse optional arguments
 GPU_DEVICES=""
@@ -30,8 +29,8 @@ done
 # Remaining args are the command
 COMMAND="$@"
 if [ -z "$COMMAND" ]; then
-    echo "[ERROR] No command provided"
-    echo "Usage: $0 [--gpus GPUS] [--image IMAGE] <command>"
+    echo "[ERROR] No command provided" >&2
+    echo "Usage: $0 [--gpus GPUS] [--image IMAGE] <command>" >&2
     exit 1
 fi
 
@@ -43,7 +42,7 @@ elif command -v docker &> /dev/null; then
     CONTAINER_RUNTIME="docker"
     echo "[INFO] Using Docker"
 else
-    echo "[ERROR] Neither Apptainer nor Docker is available"
+    echo "[ERROR] Neither Apptainer nor Docker is available" >&2
     exit 1
 fi
 
@@ -57,13 +56,16 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
     elif [ -f apptainer/images/iris.sif ]; then
         IMAGE="apptainer/images/iris.sif"
     else
-        echo "[ERROR] Apptainer image not found"
+        echo "[ERROR] Apptainer image not found" >&2
         exit 1
     fi
 
-    # Create temporary overlay in workspace
-    OVERLAY="./iris_overlay_$(date +%s%N).img"
-    apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" > /dev/null 2>&1
+    # Create temporary overlay in workspace with unique name based on PID and timestamp
+    OVERLAY="./iris_overlay_$$_$(date +%s%N).img"
+    if ! apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" > /dev/null 2>&1; then
+        echo "[ERROR] Failed to create Apptainer overlay" >&2
+        exit 1
+    fi
 
     # Build exec command
     EXEC_CMD="apptainer exec --overlay ${OVERLAY} --no-home --cleanenv"
@@ -76,14 +78,18 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
     # Add standard flags
     EXEC_CMD="$EXEC_CMD --bind ${PWD}:/iris_workspace --cwd /iris_workspace"
 
-    # Execute
-    $EXEC_CMD "$IMAGE" bash -c "$COMMAND"
+    # Execute with cleanup of overlay file
+    EXIT_CODE=0
+    $EXEC_CMD "$IMAGE" bash -c "$COMMAND" || EXIT_CODE=$?
+    # Clean up overlay file (always cleanup, even on failure)
+    rm -f "${OVERLAY}" 2>/dev/null || true
+    exit $EXIT_CODE
 
 elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
     IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev-triton-aafec41"}}
 
     if ! docker image inspect "$IMAGE_NAME" &> /dev/null; then
-        echo "[ERROR] Docker image $IMAGE_NAME not found"
+        echo "[ERROR] Docker image $IMAGE_NAME not found" >&2
         exit 1
     fi
 
@@ -114,7 +120,9 @@ elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
         RUN_CMD="$RUN_CMD -e HIP_VISIBLE_DEVICES=${GPU_DEVICES}"
     fi
 
-    # Execute
-    $RUN_CMD "$IMAGE_NAME" -c "$COMMAND"
+    # Execute and capture exit code
+    EXIT_CODE=0
+    $RUN_CMD "$IMAGE_NAME" -c "$COMMAND" || EXIT_CODE=$?
+    exit $EXIT_CODE
 fi
 
@@ -40,5 +40,11 @@ fi
         echo \"Testing: \$test_file with $NUM_RANKS ranks\"
         python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
     done
+
+    # Run ccl tests
+    for test_file in tests/ccl/test_*.py; do
+        echo \"Testing: \$test_file with $NUM_RANKS ranks\"
+        python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
+    done
 "
 
@@ -14,7 +14,6 @@ concurrency:
 jobs:
   build-container-image:
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 90
 
     steps:
       - name: Checkout repository
@@ -40,7 +39,6 @@ jobs:
     name: External Validation Test
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 30
 
     steps:
       - name: Checkout repository
@@ -69,7 +67,6 @@ jobs:
     name: External Gluon Validation Test
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 30
 
     steps:
       - name: Checkout repository
@@ -92,4 +89,4 @@ jobs:
           "
           echo "::endgroup::"
 
-          echo "✅ External gluon validation test passed!"
+          echo "✅ External gluon validation test passed!"
@@ -14,7 +14,6 @@ concurrency:
 jobs:
   build-container-image:
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 90
 
     steps:
       - name: Checkout repository
@@ -40,7 +39,6 @@ jobs:
     name: Pip Install Test 1/2/4 Ranks (Parallel)
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 30
 
     steps:
       - name: Checkout repository
@@ -54,8 +52,7 @@ jobs:
 
       - name: Run pip install tests for 1, 2, 4 ranks in parallel
         run: |
-          set -e
-
+          # Don't use set -e here - we want to handle errors manually for parallel processes
           # Run tests in parallel with different GPU assignments
           # Note: Each test gets 2+ GPUs even if it only uses some of them.
           # This allows tests like test_empty_device_handling to verify that
@@ -75,7 +72,7 @@ jobs:
               echo \"Testing: \$test_file with 1 ranks\"
               python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10
             done
-          " &
+          " > /tmp/test_1rank.log 2>&1 &
           PID1=$!
 
           echo "Starting 2-rank test on GPUs 2,3..."
@@ -91,7 +88,7 @@ jobs:
               echo \"Testing: \$test_file with 2 ranks\"
               python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10
             done
-          " &
+          " > /tmp/test_2rank.log 2>&1 &
           PID2=$!
 
           echo "Starting 4-rank test on GPUs 4,5,6,7..."
@@ -107,7 +104,7 @@ jobs:
               echo \"Testing: \$test_file with 4 ranks\"
               python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10
             done
-          " &
+          " > /tmp/test_4rank.log 2>&1 &
           PID4=$!
           echo "::endgroup::"
 
@@ -116,11 +113,44 @@ jobs:
           FAIL=0
           FAILED_TESTS=""
 
-          wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; }
-          wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; }
-          wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; }
+          # Wait for each process and capture exit status
+          if ! wait $PID1; then
+            echo "::error::1-rank test FAILED"
+            echo "::group::1-rank test logs"
+            cat /tmp/test_1rank.log || true
+            echo "::endgroup::"
+            FAILED_TESTS="$FAILED_TESTS 1-rank"
+            FAIL=1
+          else
+            echo "✅ 1-rank test passed"
+          fi
+
+          if ! wait $PID2; then
+            echo "::error::2-rank test FAILED"
+            echo "::group::2-rank test logs"
+            cat /tmp/test_2rank.log || true
+            echo "::endgroup::"
+            FAILED_TESTS="$FAILED_TESTS 2-rank"
+            FAIL=1
+          else
+            echo "✅ 2-rank test passed"
+          fi
+
+          if ! wait $PID4; then
+            echo "::error::4-rank test FAILED"
+            echo "::group::4-rank test logs"
+            cat /tmp/test_4rank.log || true
+            echo "::endgroup::"
+            FAILED_TESTS="$FAILED_TESTS 4-rank"
+            FAIL=1
+          else
+            echo "✅ 4-rank test passed"
+          fi
           echo "::endgroup::"
 
+          # Clean up log files
+          rm -f /tmp/test_1rank.log /tmp/test_2rank.log /tmp/test_4rank.log
+
           if [ $FAIL -eq 1 ]; then
             echo "::error::Parallel tests failed:$FAILED_TESTS"
             exit 1
@@ -132,7 +162,6 @@ jobs:
     name: Pip Install Test 8 Ranks
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 30
 
     steps:
       - name: Checkout repository
@@ -146,10 +175,8 @@ jobs:
 
       - name: Run 8-rank pip install test
         run: |
-          set -e
-
           echo "::group::Running 8-rank test on all GPUs"
-          bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" "
+          if bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" "
             set -e
             pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
             pip install -e .
@@ -161,7 +188,11 @@ jobs:
               echo \"Testing: \$test_file with 8 ranks\"
               python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10
             done
-          "
-          echo "::endgroup::"
-
-          echo "✅ 8-rank test passed!"
+          "; then
+            echo "::endgroup::"
+            echo "✅ 8-rank test passed!"
+          else
+            echo "::endgroup::"
+            echo "::error::8-rank test FAILED"
+            exit 1
+          fi
@@ -14,7 +14,6 @@ concurrency:
 jobs:
   build-container-image:
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 90
 
     steps:
       - name: Checkout repository
@@ -39,7 +38,6 @@ jobs:
     name: Test 1/2/4 Ranks (Parallel)
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 20
 
     steps:
       - name: Checkout repository
@@ -93,7 +91,6 @@ jobs:
     name: Test 8 Ranks
     needs: build-container-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 15
 
     steps:
       - name: Checkout repository