diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index c7b0f8346c..1533614adb 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -27,6 +27,8 @@ from cuda.core.experimental._utils.cuda_utils import ( ) +# TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, +# but it seems it is very convenient to expose them for testing purposes... _tls = threading.local() _lock = threading.Lock() cdef bint _is_cuInit = False @@ -55,7 +57,8 @@ cdef class DeviceProperties: cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr): """Retrieve the attribute value directly from the driver.""" cdef int val - HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) return val cdef _get_cached_attribute(self, attr): @@ -912,7 +915,8 @@ cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: primary_ctxs = _tls.primary_ctxs = [0] * total cdef cydriver.CUcontext ctx = (primary_ctxs[dev_id]) if ctx == NULL: - HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) + with nogil: + HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) primary_ctxs[dev_id] = (ctx) return ctx @@ -948,7 +952,7 @@ class Device: def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: - with _lock: + with _lock, nogil: HANDLE_RETURN(cydriver.cuInit(0)) _is_cuInit = True @@ -956,11 +960,13 @@ class Device: cdef cydriver.CUdevice dev cdef cydriver.CUcontext ctx if device_id is None: - err = cydriver.cuCtxGetDevice(&dev) + with nogil: + err = cydriver.cuCtxGetDevice(&dev) if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + with nogil: + 
HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) assert (ctx) == NULL device_id = 0 # cudart behavior else: @@ -973,18 +979,20 @@ class Device: try: devices = _tls.devices except AttributeError: - HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] for dev_id in range(total): device = super().__new__(cls) device._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - HANDLE_RETURN( - cydriver.cuDeviceGetAttribute( - &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id + with nogil: + HANDLE_RETURN( + cydriver.cuDeviceGetAttribute( + &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id + ) ) - ) if attr == 1: device._mr = DeviceMemoryResource(dev_id) else: @@ -1005,16 +1013,18 @@ class Device: f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" 
) - def _get_current_context(self, check_consistency=False) -> driver.CUcontext: + def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext: cdef cydriver.CUcontext ctx - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - if ctx == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") cdef cydriver.CUdevice dev - if check_consistency: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if (dev) != self._id: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") + cdef cydriver.CUdevice this_dev = self._id + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: + raise CUDAError("No context is bound to the calling CPU thread.") + if check_consistency: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + if dev != this_dev: + raise CUDAError("Internal error (current device is not equal to Device.device_id)") return driver.CUcontext(ctx) @property @@ -1043,10 +1053,12 @@ class Device: """ cdef cydriver.CUuuid uuid - IF CUDA_CORE_BUILD_MAJOR == "12": - HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) - ELSE: # 13.0+ - HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) + cdef cydriver.CUdevice this_dev = self._id + with nogil: + IF CUDA_CORE_BUILD_MAJOR == "12": + HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, this_dev)) + ELSE: # 13.0+ + HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, this_dev)) cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes)) cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 @@ -1058,7 +1070,10 @@ class Device: # Use 256 characters to be consistent with CUDA Runtime cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) - HANDLE_RETURN(cydriver.cuDeviceGetName(name, LENGTH, self._id)) + cdef char* name_ptr = name + cdef cydriver.CUdevice this_dev = self._id + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] return name.decode() @@ -1161,7 +1176,8 @@ class 
Device: >>> # ... do work on device 0 ... """ - cdef cydriver.CUcontext _ctx + cdef cydriver.CUcontext prev_ctx + cdef cydriver.CUcontext curr_ctx if ctx is not None: # TODO: revisit once Context is cythonized assert_type(ctx, Context) @@ -1170,16 +1186,19 @@ class Device: "the provided context was created on the device with" f" id={ctx._id}, which is different from the target id={self._id}" ) - # _ctx is the previous context - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx)) - HANDLE_RETURN(cydriver.cuCtxPushCurrent((ctx._handle))) + # prev_ctx is the previous context + curr_ctx = (ctx._handle) + with nogil: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) + HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True - if _ctx != NULL: - return Context._from_ctx((_ctx), self._id) + if prev_ctx != NULL: + return Context._from_ctx((prev_ctx), self._id) else: # use primary ctx - _ctx = _get_primary_context(self._id) - HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx)) + curr_ctx = _get_primary_context(self._id) + with nogil: + HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx)) self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd new file mode 100644 index 0000000000..0972063af3 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class Event: + + cdef: + cydriver.CUevent _handle + bint _timing_disabled + bint _busy_waited + int _device_id + object _ctx_handle + + cpdef close(self) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index db243717f6..962556597a 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -20,9 +20,7 @@ from cuda.core.experimental._context import Context from cuda.core.experimental._utils.cuda_utils import ( CUDAError, driver, - handle_return, ) -import sys if TYPE_CHECKING: import cuda.bindings from cuda.core.experimental._device import Device @@ -81,13 +79,6 @@ cdef class Event: and they should instead be created through a :obj:`~_stream.Stream` object. """ - cdef: - cydriver.CUevent _handle - bint _timing_disabled - bint _busy_waited - int _device_id - object _ctx_handle - def __cinit__(self): self._handle = (NULL) @@ -109,24 +100,21 @@ cdef class Event: self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) + with nogil: + HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id self._ctx_handle = ctx_handle return self - cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return - if self._handle != NULL: - HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) - self._handle = (NULL) - cpdef close(self): """Destroy the event.""" - self._shutdown_safe_close(is_shutting_down=None) + if self._handle != NULL: + with nogil: + HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) + self._handle = (NULL) - def __del__(self): - self._shutdown_safe_close() + def __dealloc__(self): + self.close() def __isub__(self, other): return NotImplemented @@ 
-137,7 +125,8 @@ cdef class Event: def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + with nogil: + err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) if err == 0: return timing else: @@ -187,12 +176,14 @@ cdef class Event: has been completed. """ - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result = cydriver.cuEventQuery(self._handle) + with nogil: + result = cydriver.cuEventQuery(self._handle) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 3fdc1410f7..39afa67234 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -4,12 +4,19 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t +cimport cpython +from libc.limits cimport ULLONG_MAX +from libc.stdint cimport uintptr_t, intptr_t +from libc.string cimport memset, memcpy + +from cuda.bindings cimport cydriver + +from cuda.core.experimental._stream cimport Stream as cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN, ) -import sys from dataclasses import dataclass from typing import Optional, TypeVar, Union, TYPE_CHECKING @@ -20,7 +27,6 @@ import cython import multiprocessing import os import platform -import sys import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream, default_stream @@ -33,9 +39,6 @@ if TYPE_CHECKING: from ._device import Device import 
uuid -# TODO: define a memory property mixin class and make Buffer and -# MemoryResource both inherit from it - PyCapsule = TypeVar("PyCapsule") """Represent the capsule type.""" @@ -44,7 +47,56 @@ DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" -cdef class Buffer: +cdef class _cyBuffer: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef: + intptr_t _ptr + size_t _size + _cyMemoryResource _mr + object _ptr_obj + + +cdef class _cyMemoryResource: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef Buffer _allocate(self, size_t size, cyStream stream): + raise NotImplementedError + + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + raise NotImplementedError + + +class MemoryResourceAttributes(abc.ABC): + + @property + @abc.abstractmethod + def is_device_accessible(self) -> bool: + """bool: True if buffers allocated by this resource can be accessed on the device.""" + ... + + @property + @abc.abstractmethod + def is_host_accessible(self) -> bool: + """bool: True if buffers allocated by this resource can be accessed on the host.""" + ... + + @property + @abc.abstractmethod + def device_id(self) -> int: + """int: The device ordinal for which this memory resource is responsible. + + Raises + ------ + RuntimeError + If the resource is not bound to a specific device. + """ + ... + + +cdef class Buffer(_cyBuffer, MemoryResourceAttributes): """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -53,12 +105,7 @@ cdef class Buffer: Support for data interchange mechanisms are provided by DLPack. 
""" - - cdef: - uintptr_t _ptr - size_t _size - object _mr - object _ptr_obj + cdef dict __dict__ # required if inheriting from both Cython/Python classes def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") @@ -66,23 +113,14 @@ cdef class Buffer: @classmethod def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None): cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) + self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size self._mr = mr return self - def __del__(self): - self._shutdown_safe_close() - - cdef _shutdown_safe_close(self, stream: Stream = None, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return - if self._ptr and self._mr is not None: - self._mr.deallocate(self._ptr, self._size, stream) - self._ptr = 0 - self._mr = None - self._ptr_obj = None + def __dealloc__(self): + self.close() def __reduce__(self): return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) @@ -99,7 +137,15 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ - self._shutdown_safe_close(stream, is_shutting_down=None) + if self._ptr and self._mr is not None: + # To be fixed in NVIDIA/cuda-python#1032 + if stream is None: + stream = Stream.__new__(Stream) + ((stream))._handle = (0) + self._mr._deallocate(self._ptr, self._size, stream) + self._ptr = 0 + self._mr = None + self._ptr_obj = None @property def handle(self) -> DevicePointerT: @@ -110,7 +156,13 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. 
""" - return self._ptr_obj + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return 0 @property def size(self) -> int: @@ -147,20 +199,23 @@ cdef class Buffer: """Export a buffer allocated for sharing between processes.""" if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - err, ptr = driver.cuMemPoolExportPointer(self.handle) - raise_if_driver_error(err) - return IPCBufferDescriptor._init(ptr.reserved, self.size) + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) @classmethod - def from_ipc_descriptor(cls, mr: MemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor) -> Buffer: """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") - share_data = driver.CUmemPoolPtrExportData() - share_data.reserved = ipc_buffer._reserved - err, ptr = driver.cuMemPoolImportPointer(mr._mempool_handle, share_data) - raise_if_driver_error(err) - return Buffer.from_handle(ptr, ipc_buffer.size, mr) + cdef cydriver.CUmemPoolPtrExportData share_data + memcpy(share_data.reserved, (ipc_buffer._reserved), sizeof(share_data.reserved)) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &share_data)) + return Buffer.from_handle(ptr, ipc_buffer.size, mr) def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. 
@@ -284,7 +339,7 @@ cdef class Buffer: return Buffer._init(ptr, size, mr=mr) -class MemoryResource(abc.ABC): +cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties @@ -293,14 +348,10 @@ class MemoryResource(abc.ABC): hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) """ + cdef dict __dict__ # required if inheriting from both Cython/Python classes - @abc.abstractmethod - def __init__(self, *args, **kwargs): - """Initialize the memory resource. - - Subclasses may use additional arguments to configure the resource. - """ - ... + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + self.deallocate(ptr, size, stream) @abc.abstractmethod def allocate(self, size_t size, stream: Stream = None) -> Buffer: @@ -340,37 +391,12 @@ class MemoryResource(abc.ABC): """ ... - @property - @abc.abstractmethod - def is_device_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the device.""" - ... - - @property - @abc.abstractmethod - def is_host_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the host.""" - ... - - @property - @abc.abstractmethod - def device_id(self) -> int: - """int: The device ordinal for which this memory resource is responsible. - - Raises - ------ - RuntimeError - If the resource is not bound to a specific device. - """ - ... - # IPC is currently only supported on Linux. On other platforms, the IPC handle # type is set equal to the no-IPC handle type. 
+cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -_NOIPC_HANDLE_TYPE = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -_IPC_HANDLE_TYPE = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else _NOIPC_HANDLE_TYPE cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -424,8 +450,7 @@ cdef class IPCAllocationHandle: self._handle = -1 self._uuid = None - def __del__(self): - """Close the handle.""" + def __dealloc__(self): self.close() def __int__(self) -> int: @@ -474,6 +499,7 @@ cdef class DeviceMemoryResourceOptions: max_size : cython.int = 0 +# TODO: cythonize this? class DeviceMemoryResourceAttributes: def __init__(self, *args, **kwargs): raise RuntimeError("DeviceMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @@ -491,8 +517,9 @@ class DeviceMemoryResourceAttributes: def fget(self) -> property_type: mr = self._mr() if mr is None: - raise RuntimeError("DeviceMemoryResource is expired") - err, value = driver.cuMemPoolGetAttribute(mr._mempool_handle, attr_enum) + raise RuntimeError("DeviceMemoryResource is expired") + # TODO: this implementation does not allow lowering to Cython + nogil + err, value = driver.cuMemPoolGetAttribute(mr.handle, attr_enum) raise_if_driver_error(err) return property_type(value) return property(fget=fget, doc=stub.__doc__) @@ -539,7 +566,8 @@ class DeviceMemoryResourceAttributes: # and the serialized buffer descriptor. _ipc_registry = {} -class DeviceMemoryResource(MemoryResource): + +cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -617,97 +645,109 @@ class DeviceMemoryResource(MemoryResource): methods. 
The reconstruction procedure uses the registry to find the associated MMR. """ - __slots__ = ("_dev_id", "_mempool_handle", "_attributes", "_ipc_handle_type", - "_mempool_owned", "_is_mapped", "_uuid", "_alloc_handle") + cdef: + int _dev_id + cydriver.CUmemoryPool _mempool_handle + object _attributes + cydriver.CUmemAllocationHandleType _ipc_handle_type + bint _mempool_owned + bint _is_mapped + object _uuid + IPCAllocationHandle _alloc_handle + dict __dict__ # required if inheriting from both Cython/Python classes + object __weakref__ + + def __cinit__(self): + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None def __init__(self, device_id: int | Device, options=None): - device_id = getattr(device_id, 'device_id', device_id) + cdef int dev_id = getattr(device_id, 'device_id', device_id) opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) + cdef cydriver.cuuint64_t current_threshold + cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + cdef cydriver.CUmemPoolProps properties if opts is None: # Get the current memory pool. - self._dev_id = device_id - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE + self._dev_id = dev_id + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None - err, self._mempool_handle = driver.cuDeviceGetMemPool(self.device_id) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) - # Set a higher release threshold to improve performance when there are no active allocations. 
- # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - err, current_threshold = driver.cuMemPoolGetAttribute( - self._mempool_handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD - ) - raise_if_driver_error(err) - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if int(current_threshold) == 0: - err, = driver.cuMemPoolSetAttribute( - self._mempool_handle, - driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), + # Set a higher release threshold to improve performance when there are no active allocations. + # By default, the release threshold is 0, which means memory is immediately released back + # to the OS when there are no active suballocations, causing performance issues. + # Check current release threshold + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( + self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &current_threshold) ) - raise_if_driver_error(err) + + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if current_threshold == 0: + HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( + self._mempool_handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &max_threshold + )) else: # Create a new memory pool. 
- if opts.ipc_enabled and _IPC_HANDLE_TYPE == _NOIPC_HANDLE_TYPE: + if opts.ipc_enabled and _IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError("IPC is not available on {platform.system()}") - properties = driver.CUmemPoolProps() - properties.allocType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else _NOIPC_HANDLE_TYPE - properties.location = driver.CUmemLocation() - properties.location.id = device_id - properties.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) + properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.location.id = dev_id + properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size - properties.win32SecurityAttributes = 0 + properties.win32SecurityAttributes = NULL properties.usage = 0 - self._dev_id = device_id - self._mempool_handle = None - self._attributes = None + self._dev_id = dev_id self._ipc_handle_type = properties.handleTypes self._mempool_owned = True - self._is_mapped = False - self._uuid = None - self._alloc_handle = None - err, self._mempool_handle = driver.cuMemPoolCreate(properties) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + # TODO: should we also set the threshold here? 
if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid - def __del__(self): + def __dealloc__(self): self.close() - def close(self): + cpdef close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle is not None: - try: - if self._mempool_owned: - err, = driver.cuMemPoolDestroy(self._mempool_handle) - raise_if_driver_error(err) - finally: - if self.is_mapped: - self.unregister() - self._dev_id = None - self._mempool_handle = None - self._attributes = None - self._ipc_handle_type = _NOIPC_HANDLE_TYPE - self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None + if self._mempool_handle == NULL: + return + try: + if self._mempool_owned: + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + finally: + if self.is_mapped: + self.unregister() + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @@ -783,30 +823,30 @@ class DeviceMemoryResource(MemoryResource): """ # Quick exit for registry hits. 
uuid = getattr(alloc_handle, 'uuid', None) - self = _ipc_registry.get(uuid) - if self is not None: - return self + mr = _ipc_registry.get(uuid) + if mr is not None: + return mr device_id = getattr(device_id, 'device_id', device_id) - self = cls.__new__(cls) + cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = device_id - self._mempool_handle = None - self._attributes = None self._ipc_handle_type = _IPC_HANDLE_TYPE self._mempool_owned = True self._is_mapped = True - self._uuid = None - self._alloc_handle = None # only used for non-imported + #self._alloc_handle = None # only used for non-imported - err, self._mempool_handle = driver.cuMemPoolImportFromShareableHandle(int(alloc_handle), _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) + cdef int handle = int(alloc_handle) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( + &(self._mempool_handle), handle, _IPC_HANDLE_TYPE, 0) + ) if uuid is not None: registered = self.register(uuid) assert registered is self return self - def get_allocation_handle(self) -> IPCAllocationHandle: + cpdef IPCAllocationHandle get_allocation_handle(self): """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. @@ -816,13 +856,19 @@ class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. 
""" + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + if self._alloc_handle is None: if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") if self._is_mapped: raise RuntimeError("Imported memory resource cannot be exported") - err, alloc_handle = driver.cuMemPoolExportToShareableHandle(self._mempool_handle, _IPC_HANDLE_TYPE, 0) - raise_if_driver_error(err) + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, _IPC_HANDLE_TYPE, 0) + ) try: assert self._uuid is None import uuid @@ -833,6 +879,18 @@ class DeviceMemoryResource(MemoryResource): raise return self._alloc_handle + cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._mr = self + return buf + def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. 
@@ -854,11 +912,15 @@ class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - err, ptr = driver.cuMemAllocFromPoolAsync(size, self._mempool_handle, stream.handle) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) + return self._allocate(size, stream) - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + + cpdef deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -873,8 +935,7 @@ class DeviceMemoryResource(MemoryResource): """ if stream is None: stream = default_stream() - err, = driver.cuMemFreeAsync(ptr, stream.handle) - raise_if_driver_error(err) + self._deallocate(ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -889,9 +950,9 @@ class DeviceMemoryResource(MemoryResource): return self._dev_id @property - def handle(self) -> cuda.bindings.driver.CUmemoryPool: + def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return self._mempool_handle + return driver.CUmemoryPool((self._mempool_handle)) @property def is_handle_owned(self) -> bool: @@ -919,7 +980,7 @@ class DeviceMemoryResource(MemoryResource): @property def is_ipc_enabled(self) -> bool: """Whether this memory resource has IPC enabled.""" - return self._ipc_handle_type != _NOIPC_HANDLE_TYPE + return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE def _deep_reduce_device_memory_resource(mr): diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 
6b8a7f0f60..2fe77e07b4 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -6,3 +6,19 @@ from cuda.bindings cimport cydriver cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* + + +cdef class Stream: + + cdef: + cydriver.CUstream _handle + object _owner + bint _builtin + int _nonblocking + int _priority + cydriver.CUdevice _device_id + cydriver.CUcontext _ctx_handle + + cpdef close(self) + cdef int _get_context(self) except?-1 nogil + cdef int _get_device_and_context(self) except?-1 diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 737fd13f95..c17af4ce46 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,17 +4,18 @@ from __future__ import annotations -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, INT32_MIN from cuda.bindings cimport cydriver +from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, + CU_CONTEXT_INVALID, + get_device_from_ctx, HANDLE_RETURN, ) -import sys - import cython import os import warnings @@ -30,8 +31,6 @@ from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( driver, - get_device_from_ctx, - handle_return, ) @@ -109,20 +108,15 @@ cdef class Stream: New streams should instead be created through a :obj:`~_device.Device` object, or created directly through using an existing handle using Stream.from_handle(). 
- """ - - cdef: - cydriver.CUstream _handle - object _owner - object _builtin - object _nonblocking - object _priority - object _device_id - object _ctx_handle - def __cinit__(self): self._handle = (NULL) + self._owner = None + self._builtin = False + self._nonblocking = -1 # lazy init'd + self._priority = INT32_MIN # lazy init'd + self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd + self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd def __init__(self, *args, **kwargs): raise RuntimeError( @@ -134,31 +128,19 @@ cdef class Stream: def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) self._handle = (cydriver.CU_STREAM_LEGACY) - self._owner = None self._builtin = True - self._nonblocking = None # delayed - self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) self._handle = (cydriver.CU_STREAM_PER_THREAD) - self._owner = None self._builtin = True - self._nonblocking = None # delayed - self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) - self._owner = None - self._builtin = False if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") @@ -166,10 +148,6 @@ cdef class Stream: self._handle = _try_to_get_stream_ptr(obj) # TODO: check if obj is created under the current context/device self._owner = obj - self._nonblocking = None # delayed - self._priority = None # delayed - self._device_id = None # delayed - self._ctx_handle = None # delayed return self cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") @@ -177,37 +155,29 @@ cdef class Stream: priority = opts.priority flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if 
nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low - HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) + cdef int prio if priority is not None: - if not (low <= priority <= high): + prio = priority + if not (low <= prio <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: - priority = high + prio = high cdef cydriver.CUstream s - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) self._handle = s - self._owner = None - self._nonblocking = nonblocking - self._priority = priority - self._device_id = device_id - self._ctx_handle = None # delayed + self._nonblocking = int(nonblocking) + self._priority = prio + self._device_id = device_id if device_id is not None else self._device_id return self - def __del__(self): - self._shutdown_safe_close() - - cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): - if is_shutting_down and is_shutting_down(): - return - - if self._owner is None: - if self._handle and not self._builtin: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = (NULL) + def __dealloc__(self): + self.close() cpdef close(self): """Destroy the stream. @@ -216,11 +186,17 @@ cdef class Stream: object will instead have their references released. 
""" - self._shutdown_safe_close(is_shutting_down=None) + if self._owner is None: + if self._handle and not self._builtin: + with nogil: + HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) + else: + self._owner = None + self._handle = (NULL) def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, int(self.handle)) + return (0, (self._handle)) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -237,26 +213,29 @@ cdef class Stream: def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" cdef unsigned int flags - if self._nonblocking is None: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) + if self._nonblocking == -1: + with nogil: + HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: self._nonblocking = False - return self._nonblocking + return bool(self._nonblocking) @property def priority(self) -> int: """Return the stream priority.""" cdef int prio - if self._priority is None: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + if self._priority == INT32_MIN: + with nogil: + HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -282,9 +261,10 @@ cdef class Stream: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. 
if event is None: self._get_device_and_context() - event = Event._init(self._device_id, self._ctx_handle, options) - # TODO: revisit after Event is cythonized - HANDLE_RETURN(cydriver.cuEventRecord((event.handle), self._handle)) + event = Event._init((self._device_id), (self._ctx_handle), options) + cdef cydriver.CUevent e = ((event))._handle + with nogil: + HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -299,11 +279,12 @@ cdef class Stream: """ cdef cydriver.CUevent event cdef cydriver.CUstream stream - cdef bint discard_event if isinstance(event_or_stream, Event): event = (event_or_stream.handle) - discard_event = False + with nogil: + # TODO: support flags other than 0? + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) else: if isinstance(event_or_stream, Stream): stream = (event_or_stream.handle) @@ -316,14 +297,12 @@ cdef class Stream: f" got {type(event_or_stream)}" ) from e stream = (s.handle) - HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) - discard_event = True - - # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) - if discard_event: - HANDLE_RETURN(cydriver.cuEventDestroy(event)) + with nogil: + HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) + # TODO: support flags other than 0? 
+ HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property def device(self) -> Device: @@ -338,21 +317,23 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import self._get_device_and_context() - return Device(self._device_id) - - cdef int _get_context(Stream self) except?-1: - # TODO: consider making self._ctx_handle typed? - cdef cydriver.CUcontext ctx - if self._ctx_handle is None: - HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) - self._ctx_handle = driver.CUcontext(ctx) + return Device((self._device_id)) + + cdef int _get_context(self) except?-1 nogil: + if self._ctx_handle == CU_CONTEXT_INVALID: + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle))) return 0 - cdef int _get_device_and_context(Stream self) except?-1: - if self._device_id is None: - # Get the stream context first - self._get_context() - self._device_id = get_device_from_ctx(self._ctx_handle) + cdef int _get_device_and_context(self) except?-1: + cdef cydriver.CUcontext curr_ctx + if self._device_id == cydriver.CU_DEVICE_INVALID: + with nogil: + # Get the current context + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx)) + # Get the stream's context (self.ctx_handle is populated) + self._get_context() + # Get the stream's device (may require a context-switching dance) + self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) return 0 @property @@ -360,7 +341,7 @@ cdef class Stream: """Return the :obj:`~_context.Context` associated with this stream.""" self._get_context() self._get_device_and_context() - return Context._from_ctx(self._ctx_handle, self._device_id) + return Context._from_ctx((self._ctx_handle), (self._device_id)) @staticmethod def from_handle(handle: int) -> Stream: diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index bf570965f9..442fc70e20 100644 --- 
a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -12,11 +12,19 @@ ctypedef fused supported_error_type: cydriver.CUresult -cdef int HANDLE_RETURN(supported_error_type err) except?-1 +# mimic CU_DEVICE_INVALID +cdef cydriver.CUcontext CU_CONTEXT_INVALID = (-2) + + +cdef cydriver.CUdevice get_device_from_ctx( + cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil + + +cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil # TODO: stop exposing these within the codebase? -cpdef int _check_driver_error(error) except?-1 +cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index c095e75645..ddb7683bc5 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -52,32 +52,33 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] -cdef int HANDLE_RETURN(supported_error_type err) except?-1: +cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil: if supported_error_type is cydriver.CUresult: if err != cydriver.CUresult.CUDA_SUCCESS: return _check_driver_error(err) -cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS -cpdef inline int _check_driver_error(error) except?-1: - if error == _DRIVER_SUCCESS: +cpdef inline int _check_driver_error(cydriver.CUresult error) except?-1 nogil: + if error == cydriver.CUresult.CUDA_SUCCESS: return 0 - name_err, name = driver.cuGetErrorName(error) - if name_err != _DRIVER_SUCCESS: + cdef const char* name + name_err = cydriver.cuGetErrorName(error, &name) + if name_err 
!= cydriver.CUresult.CUDA_SUCCESS: raise CUDAError(f"UNEXPECTED ERROR CODE: {error}") - name = name.decode() - expl = DRIVER_CU_RESULT_EXPLANATIONS.get(int(error)) - if expl is not None: - raise CUDAError(f"{name}: {expl}") - desc_err, desc = driver.cuGetErrorString(error) - if desc_err != _DRIVER_SUCCESS: - raise CUDAError(f"{name}") - desc = desc.decode() - raise CUDAError(f"{name}: {desc}") + with gil: + # TODO: consider lower this to Cython + expl = DRIVER_CU_RESULT_EXPLANATIONS.get(int(error)) + if expl is not None: + raise CUDAError(f"{name.decode()}: {expl}") + cdef const char* desc + desc_err = cydriver.cuGetErrorString(error, &desc) + if desc_err != cydriver.CUresult.CUDA_SUCCESS: + raise CUDAError(f"{name.decode()}") + raise CUDAError(f"{name.decode()}: {desc.decode()}") cpdef inline int _check_runtime_error(error) except?-1: @@ -191,20 +192,23 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -def get_device_from_ctx(ctx_handle) -> int: +cdef cydriver.CUdevice get_device_from_ctx( + cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: """Get device ID from the given ctx.""" - from cuda.core.experimental._device import Device # avoid circular import - - prev_ctx = Device().context._handle - switch_context = int(ctx_handle) != int(prev_ctx) - if switch_context: - assert prev_ctx == handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(ctx_handle)) - device_id = int(handle_return(driver.cuCtxGetDevice())) - if switch_context: - assert ctx_handle == handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(prev_ctx)) - return device_id + cdef bint switch_context = (curr_ctx != target_ctx) + cdef cydriver.CUcontext ctx + cdef cydriver.CUdevice target_dev + with nogil: + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + assert curr_ctx == ctx + HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) + 
HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + assert target_ctx == ctx + HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) + return target_dev def is_sequence(obj): diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 109e8a44c9..dd509be865 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -22,7 +22,6 @@ Breaking Changes - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. - Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. -- When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident. 
New features diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index 391a183bb7..c4abd06e2c 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -128,8 +128,8 @@ cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream # Verify buffers are properly closed -assert device_buffer.handle is None, "Device buffer should be closed" -assert pinned_buffer.handle is None, "Pinned buffer should be closed" -assert new_device_buffer.handle is None, "New device buffer should be closed" +assert device_buffer.handle == 0, "Device buffer should be closed" +assert pinned_buffer.handle == 0, "Pinned buffer should be closed" +assert new_device_buffer.handle == 0, "New device buffer should be closed" print("Memory management example completed!") diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index bfead7dd31..756b2f8104 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -4,6 +4,7 @@ import contextlib import gc import multiprocessing as mp +import platform try: import psutil @@ -11,15 +12,12 @@ HAVE_PSUTIL = False else: HAVE_PSUTIL = True - import pytest -from cuda.core.experimental import _memory -from cuda.core.experimental._utils.cuda_utils import driver CHILD_TIMEOUT_SEC = 20 NBYTES = 64 -USING_FDS = _memory._IPC_HANDLE_TYPE == driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +USING_FDS = platform.system() == "Linux" skip_if_unrunnable = pytest.mark.skipif( not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" ) diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index cf48661f41..2698ccdc9d 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -370,4 +370,4 @@ def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_reso 
cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream # Verify buffer is properly closed - assert buffer.handle is None, f"{name} buffer should be closed" + assert buffer.handle == 0, f"{name} buffer should be closed" diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5886433b22..26cd2a1393 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -223,7 +223,7 @@ def test_buffer_copy_from(): def buffer_close(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) buffer.close() - assert buffer.handle is None + assert buffer.handle == 0 assert buffer.memory_resource is None @@ -426,7 +426,6 @@ def test_mempool_attributes_ownership(mempool_device): device = mempool_device mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) attributes = mr.attributes - old_handle = mr._mempool_handle mr.close() del mr @@ -436,15 +435,9 @@ def test_mempool_attributes_ownership(mempool_device): # Even when a new object is created (we found a case where the same # mempool handle was really reused). - mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) - with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): - _ = attributes.used_mem_high - - # Even if we stuff the original handle into a new class. 
- mr._mempool_handle, old_handle = old_handle, mr._mempool_handle + mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE)) # noqa: F841 with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): _ = attributes.used_mem_high - mr._mempool_handle = old_handle # Ensure that memory views dellocate their reference to dlpack tensors diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index e5ed99acfb..845522cb43 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -51,7 +51,7 @@ def test_stream_record(init_cuda): def test_stream_record_invalid_event(init_cuda): stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(AttributeError): + with pytest.raises(TypeError): stream.record(event="invalid_event") diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index b7eab9e753..d5195ed872 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -19,7 +19,6 @@ def test_system_singleton(): def test_driver_version(): driver_version = system.driver_version - print(driver_version) version = handle_return(driver.cuDriverGetVersion()) expected_driver_version = (version // 1000, (version % 1000) // 10) assert driver_version == expected_driver_version, "Driver version does not match expected value"