66from typing import Optional , Union
77from warnings import warn
88
9+ from cuda .core .experimental ._launch_config import LaunchConfig , _to_native_launch_config
10+ from cuda .core .experimental ._stream import Stream
911from cuda .core .experimental ._utils .clear_error_support import (
1012 assert_type ,
1113 assert_type_str_or_bytes ,
@@ -184,6 +186,59 @@ def cluster_scheduling_policy_preference(self, device_id: int = None) -> int:
184186 )
185187
186188
class KernelOccupancy:
    """Occupancy queries for a single kernel.

    Each method forwards the stored kernel handle to the matching
    ``cuOccupancy*`` driver call and passes the driver's result through
    ``handle_return``.

    Instances cannot be constructed directly; they are created through
    :meth:`_init` by the kernel APIs.
    """

    # Was spelled ``slots``, which only created an ordinary class attribute
    # and left every instance with a ``__dict__``.
    __slots__ = ("_handle",)

    def __new__(self, *args, **kwargs):
        raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.")

    @classmethod
    def _init(cls, handle):
        """Create an instance bound to ``handle``, bypassing the guarded ``__new__``."""
        self = super().__new__(cls)
        self._handle = handle

        return self

    @staticmethod
    def _driver_launch_config(config: LaunchConfig, stream: Optional[Stream]):
        """Convert ``config`` to a driver launch config, setting ``hStream`` when a stream is given."""
        drv_cfg = _to_native_launch_config(config)
        if stream is not None:
            drv_cfg.hStream = stream._handle
        return drv_cfg

    def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int:
        """int: Occupancy of the kernel, as reported by ``cuOccupancyMaxActiveBlocksPerMultiprocessor``.

        Parameters
        ----------
        block_size : int
            Block size forwarded to the driver query.
        dynamic_shared_memory_size : int
            Dynamic shared memory size forwarded to the driver query.
        """
        return handle_return(
            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size)
        )

    # FIXME: better docstring needed
    def max_potential_block_size(self, dynamic_shared_memory_size: int, block_size_limit: int) -> tuple[int, int]:
        """(int, int): Suggested launch configuration for reasonable occupancy.

        Returns the minimum grid size needed to achieve the maximum occupancy and
        the maximum block size that can achieve the maximum occupancy.

        Parameters
        ----------
        dynamic_shared_memory_size : int
            Dynamic shared memory size forwarded to the driver query.
        block_size_limit : int
            Block size limit forwarded to the driver query.
        """
        # ``None`` is passed for the driver's second argument, so only the
        # fixed ``dynamic_shared_memory_size`` value is supplied.
        return handle_return(
            driver.cuOccupancyMaxPotentialBlockSize(self._handle, None, dynamic_shared_memory_size, block_size_limit)
        )

    def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int:
        """int: Dynamic shared memory available per block for given launch configuration.

        Parameters
        ----------
        num_blocks_per_multiprocessor : int
            Number of blocks per multiprocessor forwarded to the driver query.
        block_size : int
            Block size forwarded to the driver query.
        """
        return handle_return(
            driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size)
        )

    def max_potential_cluster_size(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int:
        """int: The maximum cluster size that can be launched for this kernel and launch configuration.

        Parameters
        ----------
        config : LaunchConfig
            Launch configuration to query for.
        stream : Stream, optional
            When given, its handle is set as ``hStream`` on the driver launch config.
        """
        drv_cfg = self._driver_launch_config(config, stream)
        return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg))

    def max_active_clusters(self, config: LaunchConfig, stream: Optional[Stream] = None) -> int:
        """int: The maximum number of clusters that could co-exist on the target device.

        Parameters
        ----------
        config : LaunchConfig
            Launch configuration to query for.
        stream : Stream, optional
            When given, its handle is set as ``hStream`` on the driver launch config.
        """
        drv_cfg = self._driver_launch_config(config, stream)
        return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg))
240+
241+
187242ParamInfo = namedtuple ("ParamInfo" , ["offset" , "size" ])
188243
189244
@@ -198,7 +253,7 @@ class Kernel:
198253
199254 """
200255
201- __slots__ = ("_handle" , "_module" , "_attributes" )
256+ __slots__ = ("_handle" , "_module" , "_attributes" , "_occupancy" )
202257
def __new__(self, *args, **kwargs):
    """Reject direct construction; the error message points callers to the ObjectCode APIs."""
    message = "Kernel objects cannot be instantiated directly. Please use ObjectCode APIs."
    raise RuntimeError(message)
@@ -211,6 +266,7 @@ def _from_obj(cls, obj, mod):
211266 ker ._handle = obj
212267 ker ._module = mod
213268 ker ._attributes = None
269+ ker ._occupancy = None
214270 return ker
215271
216272 @property
@@ -250,6 +306,13 @@ def arguments_info(self) -> list[ParamInfo]:
250306 _ , param_info = self ._get_arguments_info (param_info = True )
251307 return param_info
252308
@property
def occupancy(self) -> KernelOccupancy:
    """Get the occupancy information for launching this kernel.

    The :class:`KernelOccupancy` object is created from this kernel's
    handle on first access and cached, so later accesses return the same
    object.
    """
    if self._occupancy is None:
        self._occupancy = KernelOccupancy._init(self._handle)
    return self._occupancy
315+
253316 # TODO: implement from_handle()
254317
255318
0 commit comments