Fix LaunchConfig.grid unit conversion when cluster is set (#868)

Copilot · web-flow · commit bb1fe80fcfb6 · 2025-08-20T21:49:01.000-04:00
diff --git a/cuda_core/cuda/core/experimental/_launch_config.py b/cuda_core/cuda/core/experimental/_launch_config.py
@@ -35,10 +35,20 @@ def _lazy_init():
 class LaunchConfig:
     """Customizable launch options.
 
+    Note
+    ----
+    When cluster is specified, the grid parameter represents the number of
+    clusters (not blocks). The hierarchy is: grid (clusters) -> cluster (blocks) ->
+    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
+    cluster specifies blocks per cluster, and each dimension in block specifies
+    threads per block.
+
     Attributes
     ----------
     grid : Union[tuple, int]
-        Collection of threads that will execute a kernel function.
+        Collection of threads that will execute a kernel function. When cluster
+        is not specified, this represents the number of blocks, otherwise
+        this represents the number of clusters.
     cluster : Union[tuple, int]
         Group of blocks (Thread Block Cluster) that will execute on the same
         GPU Processing Cluster (GPC). Blocks within a cluster have access to
@@ -89,16 +99,29 @@ def __post_init__(self):
 def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
     _lazy_init()
     drv_cfg = driver.CUlaunchConfig()
-    drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
-    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
-    drv_cfg.sharedMemBytes = config.shmem_size
-    attrs = []  # TODO: support more attributes
+
+    # Handle grid dimensions and cluster configuration
     if config.cluster:
+        # Convert grid from cluster units to block units
+        grid_blocks = (
+            config.grid[0] * config.cluster[0],
+            config.grid[1] * config.cluster[1],
+            config.grid[2] * config.cluster[2],
+        )
+        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = grid_blocks
+
+        # Set up cluster attribute
         attr = driver.CUlaunchAttribute()
         attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         dim = attr.value.clusterDim
         dim.x, dim.y, dim.z = config.cluster
-        attrs.append(attr)
+        attrs = [attr]
+    else:
+        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
+        attrs = []
+
+    drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
+    drv_cfg.sharedMemBytes = config.shmem_size
     if config.cooperative_launch:
         attr = driver.CUlaunchAttribute()
         attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst
@@ -7,6 +7,7 @@ Release Notes
 .. toctree::
    :maxdepth: 3
 
+   release/0.X.Y-notes
    release/0.3.2-notes
    release/0.3.1-notes
    release/0.3.0-notes
diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -0,0 +1,39 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. currentmodule:: cuda.core.experimental
+
+``cuda.core`` 0.X.Y Release Notes
+=================================
+
+Released on TBD
+
+
+Highlights
+----------
+
+- Fix for :class:`LaunchConfig` grid parameter unit conversion when thread block clusters are used.
+
+
+Breaking Changes
+----------------
+
+- **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``.
+
+
+New features
+------------
+
+None.
+
+
+New examples
+------------
+
+None.
+
+
+Fixes and enhancements
+----------------------
+
+- Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867).
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
@@ -5,14 +5,23 @@
 # ################################################################################
 #
 # This demo illustrates the use of thread block clusters in the CUDA launch
-# configuration.
+# configuration and verifies that the correct grid size is passed to the kernel.
 #
 # ################################################################################
 
 import os
 import sys
 
-from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+import numpy as np
+
+from cuda.core.experimental import (
+    Device,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    launch,
+)
 
 # prepare include
 cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
@@ -26,17 +35,34 @@
 if os.path.isdir(cccl_include):
     include_path.insert(0, cccl_include)
 
-# print cluster info using a kernel
+# print cluster info using a kernel and store results in pinned memory
 code = r"""
 #include <cooperative_groups.h>
 
 namespace cg = cooperative_groups;
 
 extern "C"
-__global__ void check_cluster_info() {
+__global__ void check_cluster_info(unsigned int* grid_dims, unsigned int* cluster_dims, unsigned int* block_dims) {
     auto g = cg::this_grid();
     auto b = cg::this_thread_block();
+
     if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) {
+        // Store grid dimensions (in blocks)
+        grid_dims[0] = g.dim_blocks().x;
+        grid_dims[1] = g.dim_blocks().y;
+        grid_dims[2] = g.dim_blocks().z;
+
+        // Store grid dimensions (in clusters)
+        cluster_dims[0] = g.dim_clusters().x;
+        cluster_dims[1] = g.dim_clusters().y;
+        cluster_dims[2] = g.dim_clusters().z;
+
+        // Store block dimensions (in threads)
+        block_dims[0] = b.dim_threads().x;
+        block_dims[1] = b.dim_threads().y;
+        block_dims[2] = b.dim_threads().z;
+
+        // Also print to console
         printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z);
         printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z);
         printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z);
@@ -70,8 +96,49 @@
 block = 32
 config = LaunchConfig(grid=grid, cluster=cluster, block=block)
 
+# allocate pinned memory to store kernel results
+pinned_mr = LegacyPinnedMemoryResource()
+element_size = np.dtype(np.uint32).itemsize
+
+# allocate 3 uint32 values each for grid, cluster, and block dimensions
+grid_buffer = pinned_mr.allocate(3 * element_size)
+cluster_buffer = pinned_mr.allocate(3 * element_size)
+block_buffer = pinned_mr.allocate(3 * element_size)
+
+# create NumPy arrays from the pinned memory
+grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32)
+cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32)
+block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32)
+
+# initialize arrays to zero
+grid_dims[:] = 0
+cluster_dims[:] = 0
+block_dims[:] = 0
+
 # launch kernel on the default stream
-launch(dev.default_stream, config, ker)
+launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()
 
+# verify results
+print("\nResults stored in pinned memory:")
+print(f"Grid dimensions (blocks): {tuple(grid_dims)}")
+print(f"Cluster dimensions: {tuple(cluster_dims)}")
+print(f"Block dimensions (threads): {tuple(block_dims)}")
+
+# verify that grid conversion worked correctly:
+# LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster)
+expected_grid_blocks = grid * cluster  # 4 * 2 = 8
+actual_grid_blocks = grid_dims[0]
+
+print("\nVerification:")
+print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster")
+print(f"Expected total blocks: {expected_grid_blocks}")
+print(f"Actual total blocks: {actual_grid_blocks}")
+
+if actual_grid_blocks == expected_grid_blocks:
+    print("✓ Grid conversion is correct!")
+else:
+    print("✗ Grid conversion failed!")
+    sys.exit(1)
+
 print("done!")
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
@@ -23,6 +23,7 @@
     launch,
 )
 from cuda.core.experimental._memory import _SynchronousMemoryResource
+from cuda.core.experimental._utils.cuda_utils import CUDAError
 
 
 def test_launch_config_init(init_cuda):
@@ -59,6 +60,68 @@ def test_launch_config_shmem_size():
     assert config.shmem_size == 0
 
 
+def test_launch_config_cluster_grid_conversion(init_cuda):
+    """Test that LaunchConfig stores grid as given; conversion to blocks happens only in the native config."""
+    try:
+        # Test case 1: 1D - Issue #867 example
+        config = LaunchConfig(grid=4, cluster=2, block=32)
+        assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
+        assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}"
+        assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}"
+
+        # Test case 2: 2D grid and cluster
+        config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
+        assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}"
+        assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}"
+
+        # Test case 3: 3D full specification
+        config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8))
+        assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}"
+        assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}"
+
+        # Test case 4: Identity case
+        config = LaunchConfig(grid=1, cluster=1, block=32)
+        assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}"
+
+        # Test case 5: No cluster (should not convert grid)
+        config = LaunchConfig(grid=4, block=32)
+        assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
+        assert config.cluster is None
+
+    except CUDAError:
+        pytest.skip("Driver or GPU not new enough for thread block clusters")
+
+
+def test_launch_config_native_conversion(init_cuda):
+    """Test that _to_native_launch_config correctly converts grid from cluster units to block units."""
+    from cuda.core.experimental._launch_config import _to_native_launch_config
+
+    try:
+        # Test case 1: 1D - Issue #867 example
+        config = LaunchConfig(grid=4, cluster=2, block=32)
+        native_config = _to_native_launch_config(config)
+        assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}"
+        assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
+        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
+
+        # Test case 2: 2D grid and cluster
+        config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
+        native_config = _to_native_launch_config(config)
+        assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
+        assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}"
+        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
+
+        # Test case 3: No cluster (should not convert grid)
+        config = LaunchConfig(grid=4, block=32)
+        native_config = _to_native_launch_config(config)
+        assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
+        assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
+        assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"
+
+    except CUDAError:
+        pytest.skip("Driver or GPU not new enough for thread block clusters")
+
+
 def test_launch_invalid_values(init_cuda):
     code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")