Queue up 'skip reasons' (#1352)

mdboom · web-flow · commit 4886636b572e · 2025-12-10T02:35:20.000Z
diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py
@@ -71,14 +71,10 @@ def get_devices(device_info):
 
 
 @pytest.fixture
-def for_all_devices(device_info):
+def all_devices(device_info):
+    """Yield the sorted, de-duplicated devices from get_devices() inside an NVMLInitializer context."""
     with NVMLInitializer():
-        unique_devices = set()
-        for device_id in get_devices(device_info):
-            if device_id not in unique_devices:
-                unique_devices.add(device_id)
-                yield device_id
-                # RestoreDefaultEnvironment.restore()
+        yield sorted(set(get_devices(device_info)))
 
 
 @pytest.fixture
diff --git a/cuda_bindings/tests/nvml/test_compute_mode.py b/cuda_bindings/tests/nvml/test_compute_mode.py
@@ -15,15 +15,22 @@
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
-def test_compute_mode_supported_nonroot(for_all_devices):
-    device = for_all_devices
-
-    try:
-        original_compute_mode = nvml.device_get_compute_mode(device)
-    except nvml.NotSupportedError:
-        pytest.skip("nvmlDeviceGetComputeMode not supported")
-
-    for cm in COMPUTE_MODES:
-        with pytest.raises(nvml.NoPermissionError):
-            nvml.device_set_compute_mode(device, cm)
-        assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
+def test_compute_mode_supported_nonroot(all_devices):
+    """
+    Checks that setting any compute mode raises NoPermissionError and leaves the mode unchanged.
+    """
+    skip_reasons = set()
+    for device in all_devices:
+        try:
+            original_compute_mode = nvml.device_get_compute_mode(device)
+        except nvml.NotSupportedError:
+            skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
+            continue
+
+        for cm in COMPUTE_MODES:
+            with pytest.raises(nvml.NoPermissionError):
+                nvml.device_set_compute_mode(device, cm)
+            assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
+
+    if skip_reasons:
+        pytest.skip(" ; ".join(sorted(skip_reasons)))  # sorted: same message on every run
diff --git a/cuda_bindings/tests/nvml/test_gpu.py b/cuda_bindings/tests/nvml/test_gpu.py
@@ -22,19 +22,27 @@ def test_gpu_get_module_id(nvml_init):
         assert isinstance(module_id, int)
 
 
-def test_gpu_get_platform_info(for_all_devices):
-    device = for_all_devices
+def test_gpu_get_platform_info(all_devices):
+    """
+    Checks that device_get_platform_info returns a PlatformInfo_v2 for each device where the query is supported.
+    """
+    skip_reasons = set()
+    for device in all_devices:
+        if util.is_vgpu(device):
+            skip_reasons.add(f"Not supported on vGPU device {device}")
+            continue
 
-    if util.is_vgpu(device):
-        pytest.skip("Not supported on vGPU device")
+        # TODO
+        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
+        #     test_utils.skip_test("Not supported on chip before Blackwell")
 
-    # TODO
-    # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
-    #     test_utils.skip_test("Not supported on chip before Blackwell")
+        try:
+            platform_info = nvml.device_get_platform_info(device)
+        except nvml.NotSupportedError:
+            skip_reasons.add(f"Not supported returned, likely NVLink is disabled for {device}")
+            continue
 
-    try:
-        platform_info = nvml.device_get_platform_info(device)
-    except nvml.NotSupportedError:
-        pytest.skip("Not supported returned, likely NVLink is disabled.")
+        assert isinstance(platform_info, nvml.PlatformInfo_v2)
 
-    assert isinstance(platform_info, nvml.PlatformInfo_v2)
+    if skip_reasons:
+        pytest.skip(" ; ".join(sorted(skip_reasons)))  # sorted: same message on every run
diff --git a/cuda_bindings/tests/nvml/test_nvlink.py b/cuda_bindings/tests/nvml/test_nvlink.py
@@ -5,26 +5,25 @@
 from cuda.bindings import _nvml as nvml
 
 
-def test_nvlink_get_link_count(for_all_devices):
+def test_nvlink_get_link_count(all_devices):
     """
     Checks that the link count of the device is same.
     """
-    device = for_all_devices
+    for device in all_devices:
+        fields = nvml.FieldValue(1)
+        fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
+        value = nvml.device_get_field_values(device, fields)[0]
+        assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
+            f"Unexpected return {value.nvml_return} for link count field query"
+        )
 
-    fields = nvml.FieldValue(1)
-    fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
-    value = nvml.device_get_field_values(device, fields)[0]
-    assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
-        f"Unexpected return {value.nvml_return} for link count field query"
-    )
+        # Use the alternative argument to device_get_field_values
+        value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
+        assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
+            f"Unexpected return {value.nvml_return} for link count field query"
+        )
 
-    # Use the alternative argument to device_get_field_values
-    value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
-    assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
-        f"Unexpected return {value.nvml_return} for link count field query"
-    )
-
-    # The feature_nvlink_supported detection is not robust, so we
-    # can't be more specific about how many links we should find.
-    if value.nvml_return == nvml.Return.SUCCESS:
-        assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
+        # The feature_nvlink_supported detection is not robust, so we
+        # can't be more specific about how many links we should find.
+        if value.nvml_return == nvml.Return.SUCCESS:
+            assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
diff --git a/cuda_bindings/tests/nvml/test_page_retirement.py b/cuda_bindings/tests/nvml/test_page_retirement.py
@@ -20,42 +20,55 @@ def supports_page_retirement(device):
         return False
 
 
-def test_page_retirement_notsupported(for_all_devices):
+def test_page_retirement_notsupported(all_devices):
     """
     Verifies that on platforms that don't supports page retirement, APIs will return Not Supported
     """
-    device = for_all_devices
+    skip_reasons = set()
 
-    if supports_page_retirement(device):
-        pytest.skip("page_retirement not supported")
+    for device in all_devices:
+        if supports_page_retirement(device):
+            skip_reasons.add(f"page_retirement is supported for {device}")
+            continue
 
-    if not util.supports_ecc(device):
-        pytest.skip("device doesn't support ECC")
+        if not util.supports_ecc(device):
+            skip_reasons.add(f"device doesn't support ECC for {device}")
+            continue
 
-    with pytest.raises(nvml.NotSupportedError):
-        for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
-            nvml.device_get_retired_pages(device, source)
+        for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
+            with pytest.raises(nvml.NotSupportedError):  # per source, so each one is checked
+                nvml.device_get_retired_pages(device, source)
 
-    with pytest.raises(nvml.NotSupportedError):
-        nvml.device_get_retired_pages_pending_status(device)
+        with pytest.raises(nvml.NotSupportedError):
+            nvml.device_get_retired_pages_pending_status(device)
 
+    if skip_reasons:
+        pytest.skip(" ; ".join(sorted(skip_reasons)))  # sorted: same message on every run
 
-def test_page_retirement_supported(for_all_devices):
+
+def test_page_retirement_supported(all_devices):
     """
     Verifies that on platforms that support page_retirement, APIs will return success
     """
-    device = for_all_devices
+    skip_reasons = set()
 
-    if not supports_page_retirement(device):
-        pytest.skip("page_retirement not supported")
+    for device in all_devices:
+        if not supports_page_retirement(device):
+            skip_reasons.add(f"page_retirement not supported for {device}")
+            continue
 
-    if not util.supports_ecc(device):
-        pytest.skip("device doesn't support ECC")
+        if not util.supports_ecc(device):
+            skip_reasons.add(f"device doesn't support ECC for {device}")
+            continue
 
-    try:
-        for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
-            nvml.device_get_retired_pages(device, source)
-    except nvml.NotSupportedError:
-        pytest.skip("Exception case: Page retirment is not supported in this GPU")
+        try:
+            for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
+                nvml.device_get_retired_pages(device, source)
+        except nvml.NotSupportedError:
+            skip_reasons.add(f"Exception case: Page retirement is not supported in this GPU {device}")
+            continue
+
+        nvml.device_get_retired_pages_pending_status(device)
 
-    nvml.device_get_retired_pages_pending_status(device)
+    if skip_reasons:
+        pytest.skip(" ; ".join(sorted(skip_reasons)))  # sorted: same message on every run