Merged
4 changes: 4 additions & 0 deletions cuda_bindings/docs/build_docs.sh
@@ -25,6 +25,10 @@ if [[ -z "${SPHINX_CUDA_BINDINGS_VER}" ]]; then
| awk -F'+' '{print $1}')
fi

if [[ "${LATEST_ONLY}" == "1" && -z "${BUILD_PREVIEW:-}" && -z "${BUILD_LATEST:-}" ]]; then
export BUILD_LATEST=1
fi

# build the docs (in parallel)
SPHINXOPTS="-j 4 -d build/.doctrees" make html

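Both `build_docs.sh` hunks in this PR add the same guard: `BUILD_LATEST` is defaulted to `1` only for a `LATEST_ONLY` build in which the caller set neither flag. A rough Python sketch of that decision (the function name is illustrative, not part of the script; "unset" and "empty" are treated alike, matching the shell's `-z "${VAR:-}"` test):

```python
def resolve_build_latest(env):
    """Sketch of the build_docs.sh guard: default BUILD_LATEST to "1" only
    when LATEST_ONLY is "1" and the caller left both BUILD_PREVIEW and
    BUILD_LATEST unset or empty."""
    if env.get("LATEST_ONLY") == "1" and not env.get("BUILD_PREVIEW") and not env.get("BUILD_LATEST"):
        # Return a copy rather than mutating the caller's mapping.
        env = dict(env, BUILD_LATEST="1")
    return env
```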
13 changes: 13 additions & 0 deletions cuda_bindings/docs/source/conf.py
@@ -26,6 +26,15 @@
release = os.environ["SPHINX_CUDA_BINDINGS_VER"]


def _github_examples_ref():
if int(os.environ.get("BUILD_PREVIEW", 0)) or int(os.environ.get("BUILD_LATEST", 0)):
return "main"
return f"v{release}"


GITHUB_EXAMPLES_REF = _github_examples_ref()
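With this helper, preview and latest doc builds link examples on ``main``, while release builds pin links to the ``v<release>`` tag. The selection can be exercised standalone (helper name and the sample version are hypothetical; the environment is passed explicitly instead of read from ``os.environ``):

```python
def pick_examples_ref(release, environ):
    """Mirror of _github_examples_ref(): preview/latest docs track main,
    release docs pin the v<release> tag."""
    if int(environ.get("BUILD_PREVIEW", 0)) or int(environ.get("BUILD_LATEST", 0)):
        return "main"
    return f"v{release}"
```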


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
@@ -99,6 +108,10 @@
# skip cmdline prompts
copybutton_exclude = ".linenos, .gp"

rst_epilog = f"""
.. |cuda_bindings_github_ref| replace:: {GITHUB_EXAMPLES_REF}
"""

intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
68 changes: 68 additions & 0 deletions cuda_bindings/docs/source/examples.rst
@@ -0,0 +1,68 @@
.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

Examples
========

This page links to the ``cuda.bindings`` examples shipped in the
`cuda-python repository <https://github.com/NVIDIA/cuda-python/tree/|cuda_bindings_github_ref|/cuda_bindings/examples>`_.
Use it as a quick index when you want a runnable sample for a specific API area
or CUDA feature.

Introduction
------------

- `clock_nvrtc.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/clock_nvrtc.py>`_
uses NVRTC-compiled CUDA code and the device clock to time a reduction
kernel.
- `simple_cubemap_texture.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_cubemap_texture.py>`_
demonstrates cubemap texture sampling and transformation.
- `simple_p2p.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_p2p.py>`_
shows peer-to-peer memory access and transfers between multiple GPUs.
- `simple_zero_copy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_zero_copy.py>`_
uses zero-copy mapped host memory for vector addition.
- `system_wide_atomics.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/system_wide_atomics.py>`_
demonstrates system-wide atomic operations on managed memory.
- `vector_add_drv.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/vector_add_drv.py>`_
uses the CUDA Driver API and unified virtual addressing for vector addition.
- `vector_add_mmap.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/vector_add_mmap.py>`_
uses virtual memory management APIs such as ``cuMemCreate`` and
``cuMemMap`` for vector addition.

Concepts and techniques
-----------------------

- `stream_ordered_allocation.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/2_Concepts_and_Techniques/stream_ordered_allocation.py>`_
demonstrates ``cudaMallocAsync`` and ``cudaFreeAsync`` together with
memory-pool release thresholds.

CUDA features
-------------

- `global_to_shmem_async_copy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py>`_
compares asynchronous global-to-shared-memory copy strategies in matrix
multiplication kernels.
- `simple_cuda_graphs.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/3_CUDA_Features/simple_cuda_graphs.py>`_
shows both manual CUDA graph construction and stream-capture-based replay.

Libraries and tools
-------------------

- `conjugate_gradient_multi_block_cg.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py>`_
implements a conjugate-gradient solver with cooperative groups and
multi-block synchronization.
- `nvidia_smi.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py>`_
uses NVML to implement a Python subset of ``nvidia-smi``.

Advanced and interoperability
-----------------------------

- `iso_fd_modelling.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/iso_fd_modelling.py>`_
runs isotropic finite-difference wave propagation across multiple GPUs with
peer-to-peer halo exchange.
- `jit_program.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/jit_program.py>`_
JIT-compiles a SAXPY kernel with NVRTC and launches it through the Driver
API.
- `numba_emm_plugin.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/extra/numba_emm_plugin.py>`_
shows how to back Numba's EMM interface with the NVIDIA CUDA Python Driver
API.
1 change: 1 addition & 0 deletions cuda_bindings/docs/source/index.rst
@@ -11,6 +11,7 @@
release
install
overview
examples
motivation
environment_variables
api
8 changes: 6 additions & 2 deletions cuda_bindings/docs/source/overview.rst
@@ -31,7 +31,8 @@ API <http://docs.nvidia.com/cuda/cuda-driver-api/index.html>`_, manually create
CUDA context and all required resources on the GPU, then launch the compiled
CUDA C++ code and retrieve the results from the GPU. Now that you have an
overview, jump into a commonly used example for parallel programming:
`SAXPY <https://developer.nvidia.com/blog/six-ways-saxpy/>`_.
`SAXPY <https://developer.nvidia.com/blog/six-ways-saxpy/>`_. For more
end-to-end samples, see the :doc:`examples` page.

The first thing to do is import the `Driver
API <https://docs.nvidia.com/cuda/cuda-driver-api/index.html>`_ and
@@ -520,7 +521,10 @@ CUDA objects

Certain CUDA kernels use native CUDA types as their parameters, such as ``cudaTextureObject_t``. These types require special handling since they're neither a primitive ctype nor a custom user type. Since ``cuda.bindings`` exposes each of them as Python classes, they each implement ``getPtr()`` and ``__int__()``. These two callables are used to support the NumPy and ctypes approaches, respectively. The difference between each call is further described under `Tips and Tricks <https://nvidia.github.io/cuda-python/cuda-bindings/latest/tips_and_tricks.html#>`_.
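As a rough, GPU-free illustration of the two callables, the stand-in class below is hypothetical (it is not the real ``cudaTextureObject_t`` binding); it only mimics the documented contract: ``__int__()`` yields the handle's value for the NumPy argument-array approach, while ``getPtr()`` yields the address of the stored value for the ctypes approach.

```python
import ctypes

class FakeTextureObject:
    """Hypothetical stand-in for a cuda.bindings handle class such as
    cudaTextureObject_t, mimicking only the two documented callables."""

    def __init__(self, handle):
        # Keep the handle in ctypes storage so it has a stable address.
        self._value = ctypes.c_uint64(handle)

    def __int__(self):
        # NumPy route: the handle's value, embeddable in an argument array,
        # e.g. np.array([int(obj)], dtype=np.uint64)
        return self._value.value

    def getPtr(self):
        # ctypes route: the address of the stored handle, usable as a void*
        return ctypes.addressof(self._value)

tex = FakeTextureObject(0x1234)
as_value = int(tex)
as_pointer = ctypes.c_void_p(tex.getPtr())
# Dereferencing the pointer recovers the same handle value.
deref = ctypes.cast(as_pointer, ctypes.POINTER(ctypes.c_uint64)).contents.value
```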

For this example, lets use the ``transformKernel`` from `examples/0_Introduction/simpleCubemapTexture_test.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py>`_:
For this example, let's use the ``transformKernel`` from
`simple_cubemap_texture.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_bindings_github_ref|/cuda_bindings/examples/0_Introduction/simple_cubemap_texture.py>`_.
The :doc:`examples` page links to more samples covering textures, graphs,
memory mapping, and multi-GPU workflows.

.. code-block:: python

12 changes: 6 additions & 6 deletions cuda_bindings/pixi.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions cuda_core/docs/build_docs.sh
@@ -24,6 +24,10 @@ if [[ -z "${SPHINX_CUDA_CORE_VER}" ]]; then
| awk -F'+' '{print $1}')
fi

if [[ "${LATEST_ONLY}" == "1" && -z "${BUILD_PREVIEW:-}" && -z "${BUILD_LATEST:-}" ]]; then
export BUILD_LATEST=1
fi

# build the docs. Allow callers to override SPHINXOPTS for serial/debug runs.
if [[ -z "${SPHINXOPTS:-}" ]]; then
SPHINXOPTS="-j 4 -d build/.doctrees"
13 changes: 13 additions & 0 deletions cuda_core/docs/source/conf.py
@@ -26,6 +26,15 @@
release = os.environ["SPHINX_CUDA_CORE_VER"]


def _github_examples_ref():
if int(os.environ.get("BUILD_PREVIEW", 0)) or int(os.environ.get("BUILD_LATEST", 0)):
return "main"
return f"cuda-core-v{release}"


GITHUB_EXAMPLES_REF = _github_examples_ref()
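The only difference from the ``cuda_bindings`` variant is the tag scheme: release builds link to the repository's ``cuda-core-v<release>`` tag. A minimal sketch (helper name and sample version hypothetical):

```python
def pick_core_examples_ref(release, environ):
    """Same main-vs-tag selection as the cuda_bindings docs build, except
    cuda.core releases are tagged cuda-core-v<release>."""
    if int(environ.get("BUILD_PREVIEW", 0)) or int(environ.get("BUILD_LATEST", 0)):
        return "main"
    return f"cuda-core-v{release}"
```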


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
@@ -97,6 +106,10 @@
# skip cmdline prompts
copybutton_exclude = ".linenos, .gp"

rst_epilog = f"""
.. |cuda_core_github_ref| replace:: {GITHUB_EXAMPLES_REF}
"""

intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
59 changes: 59 additions & 0 deletions cuda_core/docs/source/examples.rst
@@ -0,0 +1,59 @@
.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: Apache-2.0

Examples
========

This page links to the ``cuda.core`` examples shipped in the
`cuda-python repository <https://github.com/NVIDIA/cuda-python/tree/|cuda_core_github_ref|/cuda_core/examples>`_.
Use it as a quick index when you want a runnable starting point for a specific
workflow.

Compilation and kernel launch
-----------------------------

- `vector_add.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/vector_add.py>`_
compiles and launches a simple vector-add kernel with CuPy arrays.
- `saxpy.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/saxpy.py>`_
JIT-compiles a templated SAXPY kernel and launches both float and double
instantiations.
- `pytorch_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/pytorch_example.py>`_
launches a CUDA kernel with PyTorch tensors and a wrapped PyTorch stream.

Multi-device and advanced launch configuration
----------------------------------------------

- `simple_multi_gpu_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/simple_multi_gpu_example.py>`_
compiles and launches kernels across multiple GPUs.
- `thread_block_cluster.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/thread_block_cluster.py>`_
demonstrates thread block cluster launch configuration on Hopper-class GPUs.
- `tma_tensor_map.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/tma_tensor_map.py>`_
demonstrates Tensor Memory Accelerator descriptors and TMA-based bulk copies.

Linking and graphs
------------------

- `jit_lto_fractal.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/jit_lto_fractal.py>`_
uses JIT link-time optimization to link user-provided device code into a
fractal workflow at runtime.
- `cuda_graphs.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/cuda_graphs.py>`_
captures and replays a multi-kernel CUDA graph to reduce launch overhead.

Interoperability and memory access
----------------------------------

- `memory_ops.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/memory_ops.py>`_
covers memory resources, pinned memory, device transfers, and DLPack interop.
- `strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_cpu.py>`_
uses ``StridedMemoryView`` with JIT-compiled CPU code via ``cffi``.
- `strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_gpu.py>`_
uses ``StridedMemoryView`` with JIT-compiled GPU code and foreign GPU buffers.
- `gl_interop_plasma.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/gl_interop_plasma.py>`_
renders a CUDA-generated plasma effect through OpenGL interop without CPU
copies.

System inspection
-----------------

- `show_device_properties.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/show_device_properties.py>`_
prints a detailed report of the CUDA devices available on the system.
18 changes: 12 additions & 6 deletions cuda_core/docs/source/getting-started.rst
@@ -32,7 +32,9 @@ Example: Compiling and Launching a CUDA kernel
----------------------------------------------

To get a taste for ``cuda.core``, let's walk through a simple example that compiles and launches a vector addition kernel.
You can find the complete example in `vector_add.py <https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples/vector_add.py>`_.
You can find the complete example in `vector_add.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/vector_add.py>`_
and browse the :doc:`examples page <examples>` for the rest of the shipped
workflows.

First, we define a string containing the CUDA C++ kernel. Note that this is a templated kernel:

@@ -76,8 +78,10 @@ Note the use of the ``name_expressions`` parameter to the :meth:`Program.compile
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))

Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration.
We're using `CuPy <https://cupy.dev/>`_ arrays as inputs for this example, but you can use PyTorch tensors too
(we show how to do this in one of our `examples <https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples>`_).
We're using `CuPy <https://cupy.dev/>`_ arrays as inputs for this example, but
you can use PyTorch tensors too (see
`pytorch_example.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/pytorch_example.py>`_
and the :doc:`examples page <examples>`).

.. code-block:: python

@@ -108,7 +112,9 @@ Note the clean, Pythonic interface, and absence of any direct calls to the CUDA
Examples and Recipes
--------------------

As we mentioned before, ``cuda.core`` can do much more than just compile and launch kernels.
As we mentioned before, ``cuda.core`` can do much more than just compile and
launch kernels.

The best way to explore and learn the different features ``cuda.core`` is through
our `examples <https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples>`_. Find one that matches your use-case, and modify it to fit your needs!
Browse the :doc:`examples page <examples>` for direct links to every shipped
example, including multi-GPU workflows, CUDA graphs, memory utilities, and
interop-focused recipes.
1 change: 1 addition & 0 deletions cuda_core/docs/source/index.rst
@@ -11,6 +11,7 @@ Welcome to the documentation for ``cuda.core``.
:caption: Contents:

getting-started
examples
install
interoperability
api
Expand Down
21 changes: 11 additions & 10 deletions cuda_core/docs/source/interoperability.rst
@@ -66,18 +66,19 @@ designs gearing toward *stream-ordered* operations so as to avoid unnecessary sy
While the designs are robust, *implementing* such protocols can be tricky and often requires
a few iterations to ensure correctness.

``cuda.core`` offers a :func:`~utils.args_viewable_as_strided_memory` decorator for
extracting the metadata (such as pointer address, shape, strides, and dtype) from any
Python objects supporting either CAI or DLPack and returning a :class:`~utils.StridedMemoryView`
object. See the
`strided_memory_view_constructors.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_constructors.py>`_
``cuda.core`` offers a :func:`~utils.args_viewable_as_strided_memory` decorator
for extracting the metadata (such as pointer address, shape, strides, and
dtype) from any Python objects supporting either CAI or DLPack and returning a
:class:`~utils.StridedMemoryView` object. See the
`strided_memory_view_constructors.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_constructors.py>`_
example for the explicit constructors, or
`strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_cpu.py>`_
`strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_cpu.py>`_
and
`strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_gpu.py>`_
for decorator-based workflows. This provides a *concrete implementation* to both protocols that is
**array-library-agnostic**, so that all Python projects can just rely on this without either
re-implementing (the consumer-side of) the protocols or tying to any particular array libraries.
`strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/|cuda_core_github_ref|/cuda_core/examples/strided_memory_view_gpu.py>`_
for decorator-based workflows. This provides a *concrete implementation* to
both protocols that is **array-library-agnostic**, so that all Python projects
can just rely on this without either re-implementing (the consumer-side of)
the protocols or tying to any particular array libraries.
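As a rough, library-free sketch of what the consumer side of one of these protocols involves, the class and helper below are invented for illustration (they are not cuda.core API); they use the CUDA Array Interface (CAI) dict layout to show the metadata a ``StridedMemoryView``-like object records:

```python
class FakeDeviceArray:
    """Hypothetical producer exposing the CUDA Array Interface (CAI) dict."""

    def __init__(self, ptr, shape, typestr):
        self.__cuda_array_interface__ = {
            "version": 3,
            "data": (ptr, False),  # (pointer address, read_only flag)
            "shape": shape,
            "strides": None,       # None means C-contiguous
            "typestr": typestr,    # e.g. "<f4" for little-endian float32
        }

def view_metadata(obj):
    """Consumer side: extract the metadata a StridedMemoryView-like view
    captures (pointer, shape, strides, dtype string) from the CAI dict."""
    cai = obj.__cuda_array_interface__
    return {
        "ptr": cai["data"][0],
        "shape": tuple(cai["shape"]),
        "strides": cai["strides"],
        "typestr": cai["typestr"],
    }
```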

The :attr:`~utils.StridedMemoryView.is_device_accessible` attribute can be used to check
whether or not the underlying buffer can be accessed on GPU.