Simplify LatchKernel, move the CUDA-header skip into its constructor, and fix formatting.

Andy-Jost · Andy-Jost · commit f98a286bac60 · 2025-10-17T10:38:01.000-07:00
diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py
@@ -81,7 +81,6 @@ def verify_buffer(self, buffer, seed=None, value=None):
         self.sync_target.sync()
         assert libc.memcmp(ptr_test, ptr_expected, self.size) == 0
 
-
     @staticmethod
     def _ptr(buffer):
         """Get a pointer to the specified buffer."""
@@ -101,7 +100,7 @@ def _get_pattern_buffer(self, seed, value):
                 pattern_buffer = DummyUnifiedMemoryResource(self.device).allocate(self.size)
                 ptr = self._ptr(pattern_buffer)
                 for i in range(self.size):
-                    ptr[i] =  (seed + i) & 0xFF
+                    ptr[i] = (seed + i) & 0xFF
             self.pattern_buffers[key] = pattern_buffer
         return pattern_buffer
 
@@ -121,5 +120,3 @@ def compare_equal_buffers(buffer1, buffer2):
     ptr1 = ctypes.cast(int(buffer1.handle), ctypes.POINTER(ctypes.c_byte))
     ptr2 = ctypes.cast(int(buffer2.handle), ctypes.POINTER(ctypes.c_byte))
     return libc.memcmp(ptr1, ptr2, buffer1.size) == 0
-
-
diff --git a/cuda_core/tests/helpers/latch.py b/cuda_core/tests/helpers/latch.py
@@ -1,22 +1,28 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import ctypes
+
+import pytest
 from cuda.core.experimental import (
     LaunchConfig,
     LegacyPinnedMemoryResource,
     Program,
     ProgramOptions,
     launch,
 )
+
 import helpers
-import ctypes
+
 
 class LatchKernel:
     """
-    Manages a kernel that blocks progress until released.
+    Manages a kernel that blocks stream progress until released.
     """
 
     def __init__(self, device):
+        if helpers.CUDA_INCLUDE_PATH is None:
+            pytest.skip("need CUDA header")
         code = """
                #include <cuda/atomic>
 
@@ -44,26 +50,14 @@ def __init__(self, device):
         self.busy_wait_flag[0] = 0
 
     def launch(self, stream):
+        """Launch the latch kernel, blocking stream progress via busy waiting."""
         config = LaunchConfig(grid=1, block=1)
-        launch(stream, config, self.kernel, self.busy_wait_flag_address)
+        launch(stream, config, self.kernel, int(self.buffer.handle))
 
     def release(self):
+        """Release the latch, allowing stream progress."""
         self.busy_wait_flag[0] = 1
 
-    @property
-    def busy_wait_flag_address(self):
-        return int(self.buffer.handle)
-
     @property
     def busy_wait_flag(self):
-        return ctypes.cast(self.busy_wait_flag_address, ctypes.POINTER(ctypes.c_int32))
-
-    def close(self):
-       buffer = getattr(self, 'buffer', None)
-       if buffer is not None:
-           buffer.close()
-
-    def __del__(self):
-        self.close()
-
-
+        return ctypes.cast(int(self.buffer.handle), ctypes.POINTER(ctypes.c_int32))
diff --git a/cuda_core/tests/helpers/logging.py b/cuda_core/tests/helpers/logging.py
@@ -3,6 +3,7 @@
 
 import time
 
+
 class TimestampedLogger:
     """
     A logger that prefixes each output with a timestamp, containing the elapsed
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -1,29 +1,28 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from conftest import skipif_need_cuda_headers
-from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions
-from helpers.buffers import make_scratch_buffer, compare_equal_buffers
-from helpers.latch import LatchKernel
-from helpers.logging import TimestampedLogger
-import ctypes
 import multiprocessing as mp
+
 import pytest
-import time
+from cuda.core.experimental import Device, EventOptions
+from helpers.buffers import compare_equal_buffers, make_scratch_buffer
+from helpers.latch import LatchKernel
+from helpers.logging import TimestampedLogger
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 CHILD_TIMEOUT_SEC = 20
 NBYTES = 64
 
+
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
-    @skipif_need_cuda_headers  # libcu++
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
         mr = ipc_memory_resource
         stream1 = device.create_stream()
+        latch = LatchKernel(device)
 
         # Start the child process.
         q_out, q_in = [mp.Queue() for _ in range(2)]
@@ -41,7 +40,6 @@ def test_main(self, ipc_device, ipc_memory_resource):
         q_out.put(buffer)
 
         # Stream 1:
-        latch = LatchKernel(device)
         log("enqueuing latch kernel on stream1")
         latch.launch(stream1)
         log("enqueuing copy on stream1")
@@ -69,7 +67,6 @@ def test_main(self, ipc_device, ipc_memory_resource):
         stream1.sync()
         assert compare_equal_buffers(target, twos)
 
-
     def child_main(self, log, q_in, q_out):
         log.prefix = " child: "
         log("ready")
@@ -99,13 +96,15 @@ def test_event_is_monadic(ipc_device):
 
     stream = device.create_stream()
     e = stream.record(options={"ipc_enabled": True})
-    with pytest.raises(TypeError, match=r"^IPC-enabled events should not be re-recorded, instead create a new event by supplying options\.$"):
+    with pytest.raises(
+        TypeError,
+        match=r"^IPC-enabled events should not be re-recorded, instead create a new event by supplying options\.$",
+    ):
         stream.record(e)
 
 
 @pytest.mark.parametrize(
-    "options", [ {"ipc_enabled": True, "enable_timing": True},
-                 EventOptions(ipc_enabled=True, enable_timing=True)]
+    "options", [{"ipc_enabled": True, "enable_timing": True}, EventOptions(ipc_enabled=True, enable_timing=True)]
 )
 def test_event_timing_disabled(ipc_device, options):
     """Check that IPC-enabled events cannot be created with timing enabled."""
@@ -114,11 +113,13 @@ def test_event_timing_disabled(ipc_device, options):
     with pytest.raises(TypeError, match=r"^IPC-enabled events cannot use timing\.$"):
         stream.record(options=options)
 
+
 class TestIpcEventProperties:
     """
     Check that event properties are properly set after transfer to a child
     process.
     """
+
     @pytest.mark.parametrize("busy_waited_sync", [True, False])
     @pytest.mark.parametrize("use_options_cls", [True, False])
     @pytest.mark.parametrize("use_option_kw", [True, False])
@@ -132,13 +133,12 @@ def test_main(self, ipc_device, busy_waited_sync, use_options_cls, use_option_kw
         process.start()
 
         # Create an event and send it.
-        options = \
-            EventOptions(ipc_enabled=True, busy_waited_sync=busy_waited_sync) \
-            if use_options_cls else \
-            {"ipc_enabled": True, "busy_waited_sync": busy_waited_sync}
-        e = stream.record(options=options) \
-            if use_option_kw else \
-            stream.record(None, options)
+        options = (
+            EventOptions(ipc_enabled=True, busy_waited_sync=busy_waited_sync)
+            if use_options_cls
+            else {"ipc_enabled": True, "busy_waited_sync": busy_waited_sync}
+        )
+        e = stream.record(options=options) if use_option_kw else stream.record(None, options)
         q_out.put(e)
 
         # Check its properties.
@@ -156,28 +156,17 @@ def test_main(self, ipc_device, busy_waited_sync, use_options_cls, use_option_kw
     def child_main(self, q_in, q_out):
         device = Device()
         device.set_current()
-        stream = device.create_stream()
 
         # Get the event.
         e = q_in.get(timeout=CHILD_TIMEOUT_SEC)
 
         # Send its properties.
-        props = (e.get_ipc_descriptor(),
-                 e.is_ipc_enabled,
-                 e.is_timing_disabled,
-                 e.is_sync_busy_waited,
-                 e.device,
-                 e.context,)
+        props = (
+            e.get_ipc_descriptor(),
+            e.is_ipc_enabled,
+            e.is_timing_disabled,
+            e.is_sync_busy_waited,
+            e.device,
+            e.context,
+        )
         q_out.put(props)
-
-
-
-# TODO: daisy chain processes
-
-if __name__ == "__main__":
-    mp.set_start_method("spawn")
-    device = Device()
-    device.set_current()
-    TestIpcEventWithLatch().test_main(device)
-
-
diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -1,9 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import multiprocessing as mp
+
 from cuda.core.experimental import Buffer, DeviceMemoryResource
 from helpers.buffers import PatternGen
-import multiprocessing as mp
 
 CHILD_TIMEOUT_SEC = 20
 NBYTES = 64
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
@@ -13,7 +13,6 @@
 )
 from helpers.latch import LatchKernel
 
-from conftest import skipif_need_cuda_headers
 from cuda_python_test_helpers import IS_WSL
 
 
@@ -115,7 +114,6 @@ def test_error_timing_recorded():
         event3 - event2
 
 
-@skipif_need_cuda_headers  # libcu++
 def test_error_timing_incomplete():
     device = Device()
     device.set_current()
diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import time
+
+import pytest
 from cuda.core.experimental import Device
+from helpers.buffers import PatternGen, compare_equal_buffers, make_scratch_buffer
 from helpers.latch import LatchKernel
 from helpers.logging import TimestampedLogger
-from helpers.buffers import make_scratch_buffer, compare_equal_buffers, PatternGen
-import time
-import pytest
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 NBYTES = 64
 
+
 def test_latchkernel():
     """Test LatchKernel."""
     log = TimestampedLogger()
@@ -38,6 +40,7 @@ def test_latchkernel():
     assert compare_equal_buffers(target, ones)
     log("done")
 
+
 def test_patterngen_seeds():
     """Test PatternGen with seed argument."""
     device = Device()
@@ -49,10 +52,11 @@ def test_patterngen_seeds():
     for i in range(256):
         pgen.fill_buffer(buffer, seed=i)
         pgen.verify_buffer(buffer, seed=i)
-        for j in range(i+1, 256):
+        for j in range(i + 1, 256):
             with pytest.raises(AssertionError):
                 pgen.verify_buffer(buffer, seed=j)
 
+
 def test_patterngen_values():
     """Test PatternGen with value argument, also compare_equal_buffers."""
     device = Device()
@@ -64,4 +68,3 @@ def test_patterngen_values():
     pgen = PatternGen(device, NBYTES)
     pgen.verify_buffer(ones, value=1)
     pgen.verify_buffer(twos, value=2)
-
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -13,7 +13,6 @@
     np = None
 import ctypes
 import platform
-from helpers.buffers import DummyUnifiedMemoryResource
 
 import pytest
 from cuda.core.experimental import (
@@ -28,6 +27,7 @@
 from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor
 from cuda.core.experimental._utils.cuda_utils import handle_return
 from cuda.core.experimental.utils import StridedMemoryView
+from helpers.buffers import DummyUnifiedMemoryResource
 
 from cuda_python_test_helpers import IS_WSL, supports_ipc_mempool