Skip to content

Commit 5bd64a7

Browse files
Introduce _module.KernelOccupancy class
This class defines kernel occupancy query methods: `max_active_blocks_per_multiprocessor`, `max_potential_block_size`, `available_dynamic_shared_memory_per_block`, `max_potential_cluster_size`, and `max_active_clusters`. The implementation is based on the driver API. The following occupancy-related driver functions are not used: `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` and `cuOccupancyMaxPotentialBlockSizeWithFlags`. In `cuOccupancyMaxPotentialBlockSize`, only a constant dynamic shared-memory size is supported for now; supporting a variable dynamic shared-memory size that depends on the block size is deferred until the design is resolved.
1 parent de7b3c9 commit 5bd64a7

File tree

1 file changed

+64
-1
lines changed

1 file changed

+64
-1
lines changed

cuda_core/cuda/core/experimental/_module.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from typing import Optional, Union
77
from warnings import warn
88

9+
from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
10+
from cuda.core.experimental._stream import Stream
911
from cuda.core.experimental._utils.clear_error_support import (
1012
assert_type,
1113
assert_type_str_or_bytes,
@@ -184,6 +186,59 @@ def cluster_scheduling_policy_preference(self, device_id: int = None) -> int:
184186
)
185187

186188

189+
class KernelOccupancy:
    """Occupancy queries for a single kernel, backed by the CUDA driver API.

    Instances cannot be created directly (``__new__`` always raises); they are
    built from a kernel handle through the private :meth:`_init` constructor.
    Every query forwards to the matching ``cuOccupancy*`` driver function and
    passes the driver's result through ``handle_return``.
    """

    # Real ``__slots__`` (not a plain ``slots`` attribute) so instances carry
    # no ``__dict__``, consistent with ``Kernel`` in this module.
    __slots__ = ("_handle",)

    def __new__(self, *args, **kwargs):
        raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.")

    @classmethod
    def _init(cls, handle):
        """Build an instance around ``handle``, bypassing the guarded ``__new__``."""
        self = super().__new__(cls)
        self._handle = handle
        return self

    def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int:
        """Query ``cuOccupancyMaxActiveBlocksPerMultiprocessor`` for this kernel.

        Parameters
        ----------
        block_size : int
            Block size the kernel is intended to be launched with.
        dynamic_shared_memory_size : int
            Per-block dynamic shared-memory usage intended for the launch.

        Returns
        -------
        int
            Occupancy of the kernel, as reported by the driver.
        """
        return handle_return(
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size)
        )

    def max_potential_block_size(self, dynamic_shared_memory_size: int, block_size_limit: int) -> tuple[int, int]:
        """Suggest a launch configuration with reasonable occupancy.

        Queries ``cuOccupancyMaxPotentialBlockSize``. Only a constant dynamic
        shared-memory size is supported: the driver's block-size-to-shared-memory
        callback argument is always passed as ``None``.

        Parameters
        ----------
        dynamic_shared_memory_size : int
            Constant per-block dynamic shared-memory usage intended for the launch.
        block_size_limit : int
            Upper bound on the block size, forwarded unchanged to the driver.

        Returns
        -------
        tuple[int, int]
            The minimum grid size needed to achieve the maximum occupancy and
            the maximum block size that can achieve the maximum occupancy.
        """
        return handle_return(
            driver.cuOccupancyMaxPotentialBlockSize(self._handle, None, dynamic_shared_memory_size, block_size_limit)
        )

    def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int:
        """Query ``cuOccupancyAvailableDynamicSMemPerBlock`` for this kernel.

        Parameters
        ----------
        num_blocks_per_multiprocessor : int
            Number of blocks to fit on one multiprocessor.
        block_size : int
            Block size the kernel is intended to be launched with.

        Returns
        -------
        int
            Dynamic shared memory available per block for the given launch
            configuration.
        """
        return handle_return(
            driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size)
        )

    def max_potential_cluster_size(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int:
        """Query ``cuOccupancyMaxPotentialClusterSize`` for this kernel.

        Parameters
        ----------
        config : LaunchConfig
            Launch configuration; converted to the native driver configuration.
        stream : Stream, optional
            If given, its handle is set as ``hStream`` on the native configuration.

        Returns
        -------
        int
            The maximum cluster size that can be launched for this kernel and
            launch configuration.
        """
        drv_cfg = _to_native_launch_config(config)
        if stream is not None:
            drv_cfg.hStream = stream._handle
        return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg))

    def max_active_clusters(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int:
        """Query ``cuOccupancyMaxActiveClusters`` for this kernel.

        Parameters
        ----------
        config : LaunchConfig
            Launch configuration; converted to the native driver configuration.
        stream : Stream, optional
            If given, its handle is set as ``hStream`` on the native configuration.

        Returns
        -------
        int
            The maximum number of clusters that could co-exist on the target
            device.
        """
        drv_cfg = _to_native_launch_config(config)
        if stream is not None:
            drv_cfg.hStream = stream._handle
        return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg))
240+
241+
187242
ParamInfo = namedtuple("ParamInfo", ["offset", "size"])
188243

189244

@@ -198,7 +253,7 @@ class Kernel:
198253
199254
"""
200255

201-
__slots__ = ("_handle", "_module", "_attributes")
256+
__slots__ = ("_handle", "_module", "_attributes", "_occupancy")
202257

203258
def __new__(self, *args, **kwargs):
204259
raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.")
@@ -211,6 +266,7 @@ def _from_obj(cls, obj, mod):
211266
ker._handle = obj
212267
ker._module = mod
213268
ker._attributes = None
269+
ker._occupancy = None
214270
return ker
215271

216272
@property
@@ -250,6 +306,13 @@ def arguments_info(self) -> list[ParamInfo]:
250306
_, param_info = self._get_arguments_info(param_info=True)
251307
return param_info
252308

309+
@property
def occupancy(self) -> KernelOccupancy:
    """Get the occupancy query object for this kernel.

    The :class:`KernelOccupancy` instance is created from this kernel's
    handle on first access and reused on every later access.
    """
    if self._occupancy is None:
        # Created lazily so kernels that never query occupancy pay nothing.
        self._occupancy = KernelOccupancy._init(self._handle)
    return self._occupancy
315+
253316
# TODO: implement from_handle()
254317

255318

0 commit comments

Comments
 (0)