intel · zxd1997066 · Sep 19, 2025 · Sep 25, 2025 · Oct 12, 2025 · Nov 2, 2025
diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml
@@ -93,18 +93,22 @@ runs:
             printf(" numactl -l ");
           }
         }')"
-        pytest_extra_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
-          split(z, xpu_list, ",");
-          if (length(xpu_list) > 1) {
-            for (i=0;i<length(xpu_list);i++) {
-              ze = xpu_list[i+1];
-              printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
-                      ze, cx, i*cx, (i+1)*cx-1);
+        if [ "${{ inputs.ut_name }}" == "xpu_distributed" ];then
+          pytest_extra_args="$(python ${{ github.workspace }}/.github/scripts/check-topology.py)"
+        else
+          pytest_extra_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
+            split(z, xpu_list, ",");
+            if (length(xpu_list) > 1) {
+              for (i=0;i<length(xpu_list);i++) {
+                ze = xpu_list[i+1];
+                printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
+                        ze, cx, i*cx, (i+1)*cx-1);
+              }
+            }else {
+              printf(" -n 1 ");
             }
-          }else {
-            printf(" -n 1 ");
-          }
-        }')"
+          }')"
+        fi
         echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT}
         echo "numactl_args=${numactl_args}" |tee -a ${GITHUB_OUTPUT}
         echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}

diff --git a/.github/scripts/check-topology.py b/.github/scripts/check-topology.py
@@ -0,0 +1,116 @@
+"""Print pytest-xdist ``--tx`` worker specs for XeLink-connected GPU groups.
+
+Runs ``xpu-smi topology -m``, keeps every GPU group whose members all list
+exactly that group as their linked peers and share one CPU affinity range,
+and prints one ``--tx popen`` spec per group on stdout. On failure a message
+is written to stderr (stdout is reserved for the pytest arguments) and the
+exit status is 255.
+"""
+
+import subprocess
+import sys
+
+TOPOLOGY_LOG = "topology.log"
+EXIT_FAILURE = 255
+
+
+def fail(message):
+    """Report ``message`` on stderr and exit with status 255."""
+    print(message, file=sys.stderr)
+    sys.exit(EXIT_FAILURE)
+
+
+def read_topology():
+    """Return the output of ``xpu-smi topology -m``, or None if it failed.
+
+    The tool is run directly instead of through a ``| tee`` shell pipeline so
+    that its own exit status is checked rather than the status of ``tee``.
+    """
+    try:
+        result = subprocess.run(
+            ["xpu-smi", "topology", "-m"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
+        )
+        # Keep a copy of the raw output in the working directory, as before.
+        with open(TOPOLOGY_LOG, "w") as log:
+            log.write(result.stdout)
+    except OSError:
+        # xpu-smi could not be started, or the log could not be written.
+        return None
+    return result.stdout if result.returncode == 0 else None
+
+
+def parse_topology(text):
+    """Parse the topology matrix into per-GPU link and CPU affinity maps.
+
+    Returns ``(links, cpus)``, both keyed by the part of the row label before
+    the ``/``. ``links`` holds the comma-joined column indices whose cell
+    contains ``XL`` or ``S`` but not ``SYS``; ``cpus`` holds the first
+    comma-separated entry of the row's last column.
+    """
+    links = {}
+    cpus = {}
+    for raw in text.splitlines():
+        line = raw.strip()
+        # Only "GPU <id> ..." rows carry data; the "CPU Affinity" row is skipped.
+        if "CPU Affinity" in raw or not line.startswith("GPU "):
+            continue
+        items = line.split()
+        gpu = items[1].split("/")[0]
+        cpus[gpu] = items[-1].split(",")[0]
+        links[gpu] = ",".join(
+            str(column)
+            for column, cell in enumerate(items[2:])
+            if "SYS" not in cell and ("XL" in cell or "S" in cell)
+        )
+    return links, cpus
+
+
+def select_groups(links, cpus):
+    """Map each usable GPU group to the CPU range shared by its members.
+
+    A group is usable when the GPUs reporting a given link list are exactly
+    the GPUs named in that list and all of them have the same CPU range.
+    """
+    members = {}
+    for gpu, peers in links.items():
+        members.setdefault(peers, []).append(gpu)
+    groups = {}
+    for peers, gpus in members.items():
+        if peers != ",".join(gpus):
+            continue
+        ranges = {cpus[gpu] for gpu in gpus}
+        if len(ranges) == 1:
+            groups[peers] = ranges.pop()
+    return groups
+
+
+def build_pytest_args(groups):
+    """Return one ``--tx popen`` spec per group, concatenated into a string."""
+    specs = []
+    for mask, cpu_range in groups.items():
+        # "a-b" covers b - a + 1 CPUs; a bare "a" is a single CPU.
+        first, _, last = cpu_range.partition("-")
+        threads = int(last or first) - int(first) + 1
+        specs.append(
+            ' --tx popen//env:ZE_AFFINITY_MASK=%s//env:OMP_NUM_THREADS=%d//python="numactl -l -C %s python"'
+            % (mask, threads, cpu_range)
+        )
+    return "".join(specs)
+
+
+def main():
+    """Print the pytest arguments, or exit with status 255 on failure."""
+    text = read_topology()
+    if text is None:
+        fail("xpu-smi topology failed")
+    groups = select_groups(*parse_topology(text))
+    if not groups:
+        fail("No Xelink detected")
+    print(build_pytest_args(groups))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
@@ -114,7 +114,7 @@ jobs:
             TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
           fi
           # gcc 11
-          source /opt/rh/gcc-toolset-11/enable
+          # source /opt/rh/gcc-toolset-11/enable
           source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
           ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
             --WORKSPACE="${{ github.workspace }}" \

diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
@@ -51,6 +51,8 @@ jobs:
         uses: actions/checkout@v4
       - name: Get runner
         id: runner-info
+        with:
+          ut_name: ${{ inputs.ut }}
         uses: ./.github/actions/get-runner
 
   test-in-container:
@@ -102,7 +104,7 @@ jobs:
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }}
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -128,7 +128,7 @@ jobs:
         ut_name: [xpu_distributed]
     uses: ./.github/workflows/_linux_ut.yml
     with:
-      runner: pvc_rolling
+      runner: PVC-7900
       pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
       torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }}
       ut: ${{ matrix.ut_name }}

diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
@@ -1,4 +1,3 @@
-import os
 import subprocess
 import sys
 
@@ -9,42 +8,6 @@
 res2 = 0
 fail_test = []
 
-# Get the xelink group card affinity
-ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
-if ret == 0:
-    gpu_dict = {}
-    with open("topology.log") as file:
-        lines = file.readlines()
-        for line in lines:
-            if "CPU Affinity" in line:
-                continue
-            line = line.strip()
-            if line.startswith("GPU "):
-                items = line.split(" ")
-                items = [x for x in items if x]
-                gpu_id = items[1]
-                i = gpu_id.split("/")[0]
-                affinity = ""
-                for j, item in enumerate(items):
-                    if "SYS" not in item and ("XL" in item or "S" in item):
-                        if len(affinity) == 0:
-                            affinity = str(j - 2)
-                        else:
-                            affinity = affinity + "," + str(j - 2)
-                gpu_dict[i] = affinity
-
-    max_affinity = ""
-    for key, value in gpu_dict.items():
-        if len(value) > len(max_affinity):
-            max_affinity = value
-
-    os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
-    print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
-
-else:
-    print("xpu-smi topology failed")
-    sys.exit(255)
-
 
 # run python test
 def run(test_command):
@@ -56,8 +19,6 @@ def run(test_command):
     return result.returncode
 
 
-test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
-res += run(test_command)
 test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"]
 res += run(test_command)
 test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"]

diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py
@@ -1,36 +1,33 @@
 skip_dict = {
     "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
         "test_ddp_parity_xpu",
     ),
     "../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_core.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": (
-        "test_parity_with_non_frozen_fsdp_xpu",
-        "test_parity_with_ddp_xpu",
-    ),
+    "../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None,
+    "../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None,
+    "../../../../test/distributed/fsdp/test_utils.py": None,
+    "../../../../test/distributed/fsdp/test_wrap.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_input.py": None,
     "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": (
-        "test_transformer_no_grad_mixed_precision_True_xpu",
-    ),
-    "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
-    "../../../../test/distributed/fsdp/test_utils.py": None,
-    "distributed/test_c10d_xccl.py": (
-        # https://github.com/intel/torch-xpu-ops/issues/2046
-        "test_unwaited",
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None,
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": (
+        "test_cached_state_dict",
+        "test_dp_state_dict_cpu_offload",
     ),
+    "../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None,
+    "../../../../test/distributed/_composable/test_checkpoint.py": None,
+    "../../../../test/distributed/_composable/test_contract.py": None,
+    "distributed/test_c10d_xccl.py": None,
     "distributed/test_c10d_ops_xccl.py": None,
-    "../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
-    "../../../../test/distributed/test_functional_api.py": (
-        # depends on https://github.com/pytorch/pytorch/pull/159473
-        "test_tracing_with_fakepg_xpu",
-    ),
+    "../../../../test/distributed/test_functional_api.py": None,
+    "../../../../test/distributed/test_c10d_common.py": None,
     "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
     "../../../../test/distributed/_tools/test_mem_tracker.py": None,
     "../../../../test/distributed/_tools/test_memory_tracker.py": None,
+    "../../../../test/distributed/tensor/test_random_ops.py": None,
+    "../../../../test/distributed/tensor/test_math_ops.py": None,
 }