NVIDIA · NaderAlAwar · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
@@ -1,4 +1,4 @@
-name: CUB Benchmark Compare
+name: Benchmark Compare
 
 defaults:
   run:
@@ -18,7 +18,7 @@ on:
         default: "--cuda 13.1 --host gcc14"
         type: string
       arch:
-        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch"
+        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch"
         required: false
         default: "native"
         type: string
@@ -32,8 +32,13 @@ on:
         required: false
         default: ""
         type: string
-      filters:
-        description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')"
+      cub_filters:
+        description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')"
+        required: false
+        default: ""
+        type: string
+      python_filters:
+        description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')"
         required: false
         default: ""
         type: string
@@ -65,7 +70,7 @@ on:
         default: "--cuda 13.1 --host gcc14"
         type: string
       arch:
-        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch"
+        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch"
         required: false
         default: "native"
         type: string
@@ -79,8 +84,13 @@ on:
         required: false
         default: ""
         type: string
-      filters:
-        description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')"
+      cub_filters:
+        description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')"
+        required: false
+        default: ""
+        type: string
+      python_filters:
+        description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')"
         required: false
         default: ""
         type: string
@@ -146,7 +156,8 @@ jobs:
           INPUT_ARCH: ${{ inputs.arch }}
           INPUT_BASE_REF: ${{ inputs.base_ref }}
           INPUT_TEST_REF: ${{ inputs.test_ref }}
-          INPUT_FILTERS: ${{ inputs.filters }}
+          INPUT_CUB_FILTERS: ${{ inputs.cub_filters }}
+          INPUT_PYTHON_FILTERS: ${{ inputs.python_filters }}
           INPUT_RAW_ARGS: ${{ inputs.raw_args }}
           INPUT_NVBENCH_ARGS: ${{ inputs.nvbench_args }}
           INPUT_NVBENCH_COMPARE_ARGS: ${{ inputs.nvbench_compare_args }}
@@ -178,8 +189,12 @@ jobs:
             mapfile -d '' -t bench_args < "${parsed_raw_args_file}"
             rm -f "${parsed_raw_args_file}"
           else
-            if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" || -z "${INPUT_FILTERS}" ]]; then
-              echo "::error::When Raw Args is empty, Base Ref, Test Ref, and Filters must all be set."
+            if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" ]]; then
+              echo "::error::When Raw Args is empty, Base Ref and Test Ref must be set."
+              exit 2
+            fi
+            if [[ -z "${INPUT_CUB_FILTERS}" && -z "${INPUT_PYTHON_FILTERS}" ]]; then
+              echo "::error::At least one of CUB Filters or Python Filters must be set."
               exit 2
             fi
 
@@ -190,23 +205,40 @@ jobs:
               bench_args+=(--arch "${INPUT_ARCH}")
             fi
 
-            declare -a parsed_filters
-            parsed_filters=()
-            parsed_filters_file="$(mktemp "${RUNNER_TEMP}/bench-filters-XXXXXX")"
-            if ! parse_quoted_args "${INPUT_FILTERS}" > "${parsed_filters_file}"; then
-              rm -f "${parsed_filters_file}"
-              exit 2
+            # Add CUB filters as --cub-filter flags.
+            if [[ -n "${INPUT_CUB_FILTERS}" ]]; then
+              declare -a parsed_cub_filters
+              parsed_cub_filters=()
+              parsed_cub_filters_file="$(mktemp "${RUNNER_TEMP}/bench-cub-filters-XXXXXX")"
+              if ! parse_quoted_args "${INPUT_CUB_FILTERS}" > "${parsed_cub_filters_file}"; then
+                rm -f "${parsed_cub_filters_file}"
+                exit 2
+              fi
+              mapfile -d '' -t parsed_cub_filters < "${parsed_cub_filters_file}"
+              rm -f "${parsed_cub_filters_file}"
+
+              for cub_filter in "${parsed_cub_filters[@]}"; do
+                bench_args+=(--cub-filter "${cub_filter}")
+              done
             fi
-            mapfile -d '' -t parsed_filters < "${parsed_filters_file}"
-            rm -f "${parsed_filters_file}"
 
-            if [[ "${#parsed_filters[@]}" -eq 0 ]]; then
-              echo "::error::Filters must parse to at least one argument."
-              exit 2
+            # Add Python filters as --python-filter flags.
+            if [[ -n "${INPUT_PYTHON_FILTERS}" ]]; then
+              declare -a parsed_py_filters
+              parsed_py_filters=()
+              parsed_py_filters_file="$(mktemp "${RUNNER_TEMP}/bench-py-filters-XXXXXX")"
+              if ! parse_quoted_args "${INPUT_PYTHON_FILTERS}" > "${parsed_py_filters_file}"; then
+                rm -f "${parsed_py_filters_file}"
+                exit 2
+              fi
+              mapfile -d '' -t parsed_py_filters < "${parsed_py_filters_file}"
+              rm -f "${parsed_py_filters_file}"
+
+              for py_filter in "${parsed_py_filters[@]}"; do
+                bench_args+=(--python-filter "${py_filter}")
+              done
             fi
 
-            bench_args+=("${parsed_filters[@]}")
-
             if [[ -n "${INPUT_NVBENCH_ARGS}" ]]; then
               bench_args+=(--nvbench-args "${INPUT_NVBENCH_ARGS}")
             fi
@@ -234,7 +266,7 @@ jobs:
 
       - name: Show resolved benchmark args
         run: |
-          echo "Resolved args passed to ci/bench/cub.sh:"
+          echo "Resolved args passed to ci/bench/bench.sh:"
           while IFS= read -r arg; do
             echo "  ${arg}"
           done <<< "${{ steps.resolve-args.outputs.resolved_args }}"
@@ -263,7 +295,7 @@ jobs:
 
           base_sha_short="${base_sha_full:0:8}"
           test_sha_short="${test_sha_full:0:8}"
-          artifact_name="bench-cub-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}"
+          artifact_name="bench-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}"
 
           mkdir -p "bench-artifacts"
 
@@ -289,7 +321,7 @@ jobs:
             jsondiff \
             tabulate
 
-          ./ci/bench/cub.sh "${bench_args[@]}"
+          ./ci/bench/bench.sh "${bench_args[@]}"
           EOF
           chmod +x "${ci_script}"
 

@@ -76,7 +76,7 @@ jobs:
         run: |
           echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}"
           echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}"
-      - name: Build CUB benchmark dispatch matrix
+      - name: Build benchmark dispatch matrix
         id: build-bench-matrix
         run: |
           { # Defaults:
@@ -86,7 +86,7 @@ jobs:
 
           # Compare ci/bench.yaml against its template. If they match, no benchmarks requested.
           if diff -q "ci/bench.template.yaml" "ci/bench.yaml" > /dev/null 2>&1; then
-            echo "ci/bench.yaml matches template; skipping CUB benchmark dispatch matrix."
+            echo "ci/bench.yaml matches template; skipping benchmark dispatch."
             exit 0
           fi
 
@@ -395,8 +395,8 @@ jobs:
           print('All CPU-only import tests passed!')
           "
 
-  dispatch-cub-bench:
-    name: CUB Bench Compare (${{ matrix.gpu }})
+  dispatch-bench:
+    name: Bench Compare (${{ matrix.gpu }})
     if: >-
       ${{
         needs.build-workflow.outputs.bench_enabled == 'true' &&
@@ -409,14 +409,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.build-workflow.outputs.bench_matrix) }}
-    uses: ./.github/workflows/bench_cub.yml
+    uses: ./.github/workflows/bench.yml
     with:
       gpu: ${{ matrix.gpu }}
       launch_args: ${{ matrix.launch_args }}
       arch: ${{ matrix.arch }}
       base_ref: ${{ matrix.base_ref }}
       test_ref: ${{ matrix.test_ref }}
-      filters: ${{ matrix.filters }}
+      cub_filters: ${{ matrix.cub_filters }}
+      python_filters: ${{ matrix.python_filters }}
       nvbench_args: ${{ matrix.nvbench_args }}
       nvbench_compare_args: ${{ matrix.nvbench_compare_args }}
 
@@ -428,15 +429,15 @@ jobs:
         needs.build-workflow.outputs.bench_enabled == 'true' &&
         needs.build-workflow.outputs.pr_number != ''
       }}
-    needs: [build-workflow, dispatch-cub-bench]
+    needs: [build-workflow, dispatch-bench]
     permissions:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
       - name: Determine outcome
         id: outcome
         run: |
-          result="${{ needs.dispatch-cub-bench.result }}"
+          result="${{ needs.dispatch-bench.result }}"
           case "${result}" in
             success)  icon=":white_check_mark:"; status="completed successfully" ;;
             failure)  icon=":x:";                status="had failures" ;;
@@ -452,7 +453,7 @@ jobs:
           message: |
             ## ${{ steps.outcome.outputs.icon }} Benchmark Results
 
-            CUB benchmark comparison ${{ steps.outcome.outputs.status }}.
+            Benchmark comparison ${{ steps.outcome.outputs.status }}.
 
             **[Results](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})**
             **[Artifacts](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)**
@@ -473,7 +474,7 @@ jobs:
       - verify-devcontainers
       - docs-build
       - test-cpu-import
-      - dispatch-cub-bench
+      - dispatch-bench
     runs-on: ubuntu-latest
     steps:
       - name: Check results
@@ -498,12 +499,12 @@ jobs:
           check_result "docs-build"           "success" "${{needs.docs-build.result}}"
           check_result "test-cpu-import"      "success" "${{needs.test-cpu-import.result}}"
 
-          expected_cub_bench_result="skipped"
+          expected_bench_result="skipped"
           if [[ "${{ needs.build-workflow.outputs.bench_enabled }}" == "true" ]] \
             && [[ "${{ toJSON(fromJSON(needs.build-workflow.outputs.bench_matrix).include) }}" != "[]" ]]; then
-            expected_cub_bench_result="success"
+            expected_bench_result="success"
           fi
-          check_result "dispatch-cub-bench" "${expected_cub_bench_result}" "${{needs.dispatch-cub-bench.result}}"
+          check_result "dispatch-bench" "${expected_bench_result}" "${{needs.dispatch-bench.result}}"
 
           bench_enabled="${{ needs.build-workflow.outputs.bench_enabled }}"
           if [[ "${bench_enabled}" == "true" ]]; then

diff --git a/ci-overview.md b/ci-overview.md
@@ -71,11 +71,11 @@ CCCL's CI uses [`sccache`](https://github.com/mozilla/sccache) to cache compiler
 
 CI jobs employ the build and test scripts in the `ci/` directory to build and run tests. These scripts provide a consistent entry point for building and testing in both local and CI environments. For more information on using these scripts, see the [CONTRIBUTING.md guide](CONTRIBUTING.md#building-and-testing).
 
-#### CUB Benchmark Comparison Workflow
+#### Benchmark Comparison Workflow
 
-The standalone CUB benchmark comparison workflow is implemented in `.github/workflows/bench_cub.yml` and uses:
+The benchmark comparison workflow, which covers both CUB and Python benchmarks, is implemented in `.github/workflows/bench.yml` and uses the following scripts:
 
-- `ci/bench/cub.sh`
+- `ci/bench/bench.sh`
 - `ci/bench/compare_git_refs.sh`
 - `ci/bench/compare_paths.sh`
 

@@ -1,8 +1,8 @@
-# # CUB PR benchmark request config.
+# # CCCL PR benchmark request config.
 #
 # ## Overview:
 #
-# This file is used to request CUB benchmark comparisons in PR CI.
+# This file is used to request benchmark comparisons in PR CI.
 #
 # This file must match ci/bench.template.yaml to merge.
 # CI branch protections will fail if they differ. Reset before merging.
@@ -17,18 +17,27 @@
 #
 # ## Quick start:
 #
-# 1. Add one or more benchmark regexes under benchmarks.filters.
+# 1. Add one or more benchmark regexes under cub and/or python filters.
 # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus.
 # 3. Push and inspect the dispatched benchmark jobs/artifacts.
 # 4. Remove/reset benchmark-request edits before final merge.
 
 benchmarks:
 
-  # Inclusive regex filters (required).
-  filters:
-    # Examples:
-    # - '^cub\.bench\.for_each\.base'
-    # - '^cub\.bench\.reduce\.(sum|min)\.'
+  # CUB C++ benchmark filters (regex matched against ninja target names).
+  cub:
+    filters:
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
+
+  # Python benchmark filters (regex matched against paths under benchmarks/).
+  python:
+    filters:
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:

@@ -1,8 +1,8 @@
-# # CUB PR benchmark request config.
+# # CCCL PR benchmark request config.
 #
 # ## Overview:
 #
-# This file is used to request CUB benchmark comparisons in PR CI.
+# This file is used to request benchmark comparisons in PR CI.
 #
 # This file must match ci/bench.template.yaml to merge.
 # CI branch protections will fail if they differ. Reset before merging.
@@ -17,18 +17,27 @@
 #
 # ## Quick start:
 #
-# 1. Add one or more benchmark regexes under benchmarks.filters.
+# 1. Add one or more benchmark regexes under cub and/or python filters.
 # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus.
 # 3. Push and inspect the dispatched benchmark jobs/artifacts.
 # 4. Remove/reset benchmark-request edits before final merge.
 
 benchmarks:
 
-  # Inclusive regex filters (required).
-  filters:
-    # Examples:
-    # - '^cub\.bench\.for_each\.base'
-    # - '^cub\.bench\.reduce\.(sum|min)\.'
+  # CUB C++ benchmark filters (regex matched against ninja target names).
+  cub:
+    filters:
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
+
+  # Python benchmark filters (regex matched against paths under benchmarks/).
+  python:
+    filters:
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus: