From b064b25a3ec2cb087fcc427fea6ef3b2ceeeeada Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Tue, 26 Nov 2024 15:23:53 +0800
Subject: [PATCH 01/10] Increase some CPU example timeouts

---
 warp/tests/test_examples.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/warp/tests/test_examples.py b/warp/tests/test_examples.py
index a53c523e..d8d1a53d 100644
--- a/warp/tests/test_examples.py
+++ b/warp/tests/test_examples.py
@@ -316,7 +316,7 @@ class TestSimExamples(unittest.TestCase):
     name="sim.example_cloth",
     devices=test_devices,
     test_options={"usd_required": True},
-    test_options_cpu={"num_frames": 10},
+    test_options_cpu={"num_frames": 10, "test_timeout": 600},
 )
 add_example_test(
     TestSimExamples, name="sim.example_granular", devices=test_devices, test_options_cpu={"num_frames": 10}
@@ -396,12 +396,14 @@ class TestFemDiffusionExamples(unittest.TestCase):
     name="fem.example_convection_diffusion",
     devices=test_devices,
     test_options={"resolution": 20, "headless": True},
+    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples,
     name="fem.example_burgers",
     devices=test_devices,
     test_options={"resolution": 20, "num_frames": 25, "degree": 1, "headless": True},
+    test_options_cpu={"test_timeout": 600},
 )
 add_example_test(
     TestFemExamples,

From 22e179a76f2a9543f9a8859dec5e0124b7cefc28 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Tue, 26 Nov 2024 15:24:08 +0800
Subject: [PATCH 02/10] Add missing tests to the default suite

---
 warp/tests/unittest_suites.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/warp/tests/unittest_suites.py b/warp/tests/unittest_suites.py
index c3fa9d3a..7c798384 100644
--- a/warp/tests/unittest_suites.py
+++ b/warp/tests/unittest_suites.py
@@ -99,8 +99,11 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
     from warp.tests.test_closest_point_edge_edge import TestClosestPointEdgeEdgeMethods
     from warp.tests.test_codegen import TestCodeGen
     from warp.tests.test_codegen_instancing import TestCodeGenInstancing
+    from warp.tests.test_collision import TestCollision
+    from warp.tests.test_coloring import TestColoring
     from warp.tests.test_compile_consts import TestConstants
     from warp.tests.test_conditional import TestConditional
+    from warp.tests.test_context import TestContext
     from warp.tests.test_copy import TestCopy
     from warp.tests.test_ctypes import TestCTypes
     from warp.tests.test_dense import TestDense
@@ -127,6 +130,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
     from warp.tests.test_import import TestImport
     from warp.tests.test_indexedarray import TestIndexedArray
     from warp.tests.test_intersect import TestIntersect
+    from warp.tests.test_iter import TestIter
     from warp.tests.test_jax import TestJax
     from warp.tests.test_large import TestLarge
     from warp.tests.test_launch import TestLaunch
@@ -174,6 +178,10 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
     from warp.tests.test_streams import TestStreams
     from warp.tests.test_struct import TestStruct
     from warp.tests.test_tape import TestTape
+    from warp.tests.test_tile import TestTile
+    from warp.tests.test_tile_mathdx import TestTileMathDx
+    from warp.tests.test_tile_reduce import TestTileReduce
+    from warp.tests.test_tile_shared_memory import TestTileSharedMemory
     from warp.tests.test_torch import TestTorch
     from warp.tests.test_transient_module import TestTransientModule
     from warp.tests.test_triangle_closest_point import TestTriangleClosestPoint
@@ -200,8 +208,11 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
         TestClosestPointEdgeEdgeMethods,
         TestCodeGen,
         TestCodeGenInstancing,
-        TestConstants,
+        TestCollision,
+        TestColoring,
         TestConditional,
+        TestConstants,
+        TestContext,
         TestCopy,
         TestCTypes,
         TestDense,
@@ -228,6 +239,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
         TestImport,
         TestIndexedArray,
         TestIntersect,
+        TestIter,
         TestJax,
         TestLarge,
         TestLaunch,
@@ -275,6 +287,10 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
         TestStreams,
         TestStruct,
         TestTape,
+        TestTile,
+        TestTileMathDx,
+        TestTileReduce,
+        TestTileSharedMemory,
         TestTorch,
         TestTransientModule,
         TestTriangleClosestPoint,

From 8f33f2b130fa80bd5ae531e0e169c167753acf4f Mon Sep 17 00:00:00 2001
From: Christopher Crouzet <christopher@crouzet.pm>
Date: Wed, 9 Oct 2024 10:00:33 +1300
Subject: [PATCH 03/10] Add introductory notebooks

---
 CHANGELOG.md                     |   1 +
 README.md                        |  25 +
 notebooks/core_01_basics.ipynb   | 797 +++++++++++++++++++++++++++++++
 notebooks/core_02_generics.ipynb | 403 ++++++++++++++++
 notebooks/core_03_points.ipynb   | 360 ++++++++++++++
 notebooks/core_04_meshes.ipynb   | 554 +++++++++++++++++++++
 notebooks/core_05_volumes.ipynb  | 283 +++++++++++
 pyproject.toml                   |   1 +
 8 files changed, 2424 insertions(+)
 create mode 100644 notebooks/core_01_basics.ipynb
 create mode 100644 notebooks/core_02_generics.ipynb
 create mode 100644 notebooks/core_03_points.ipynb
 create mode 100644 notebooks/core_04_meshes.ipynb
 create mode 100644 notebooks/core_05_volumes.ipynb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b18ba98..6f61ec69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@
   to set a uniform radius for the added particles.
 - Document `array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
 - Document time-to-compile tradeoffs when using vector component assignment statements in kernels.
+- Add introductory Jupyter notebooks.
 
 ### Changed
 
diff --git a/README.md b/README.md
index 9411f60c..c5aa77ef 100644
--- a/README.md
+++ b/README.md
@@ -297,6 +297,31 @@ python -m warp.tests
     </tbody>
 </table>
 
+## Running Notebooks
+
+A few notebooks are available in the [notebooks](./notebooks/) directory to provide an overview of the key features available in Warp.
+
+To run these notebooks, ``jupyterlab`` is required to be installed using:
+
+
+```text
+pip install jupyterlab
+```
+
+From there, opening the notebooks can be done with the following command:
+
+```text
+jupyter lab ./notebooks
+```
+
+- [Warp Core Tutorial: Basics](https://github.com/NVIDIA/warp/blob/main/notebooks/core_01_basics.ipynb) <a target="_blank" href="https://colab.research.google.com/github/NVIDIA/warp/blob/main/notebooks/core_01_basics.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+- [Warp Core Tutorial: Generics](https://github.com/NVIDIA/warp/blob/main/notebooks/core_02_generics.ipynb) <a target="_blank" href="https://colab.research.google.com/github/NVIDIA/warp/blob/main/notebooks/core_02_generics.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+- [Warp Core Tutorial: Points](https://github.com/NVIDIA/warp/blob/main/notebooks/core_03_points.ipynb) <a target="_blank" href="https://colab.research.google.com/github/NVIDIA/warp/blob/main/notebooks/core_03_points.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+- [Warp Core Tutorial: Meshes](https://github.com/NVIDIA/warp/blob/main/notebooks/core_04_meshes.ipynb) <a target="_blank" href="https://colab.research.google.com/github/NVIDIA/warp/blob/main/notebooks/core_04_meshes.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+- [Warp Core Tutorial: Volumes](https://github.com/NVIDIA/warp/blob/main/notebooks/core_05_volumes.ipynb) <a target="_blank" href="https://colab.research.google.com/github/NVIDIA/warp/blob/main/notebooks/core_05_volumes.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+
+
+
 ## Building
 
 For developers who want to build the library themselves, the following tools are required:
diff --git a/notebooks/core_01_basics.ipynb b/notebooks/core_01_basics.ipynb
new file mode 100644
index 00000000..d4b44a8b
--- /dev/null
+++ b/notebooks/core_01_basics.ipynb
@@ -0,0 +1,797 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Warp Core Tutorial: Basics\n",
+    "\n",
+    "Warp is a Python framework for writing high-performance code. Warp takes regular Python functions and JIT compiles them to efficient kernel code that can run on the CPU or GPU.\n",
+    "\n",
+    "This notebook showcases the essential features and capabilities that form the foundation of programming with Warp.\n",
+    "\n",
+    "A more in-depth reference of the API can be found in the [official documentation](https://nvidia.github.io/warp/).\n",
+    "\n",
+    "Prerequisites:\n",
+    "\n",
+    "- Basic Python knowledge.\n",
+    "- Understanding of NumPy arrays."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install warp-lang"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import warp as wp\n",
+    "\n",
+    "wp.config.quiet = True\n",
+    "\n",
+    "# Explicitly initializing Warp is not necessary but\n",
+    "# we do it here to ensure everything is good to go.\n",
+    "wp.init()\n",
+    "\n",
+    "if not wp.get_cuda_device_count():\n",
+    "    print(\n",
+    "        \"Some snippets in this notebook assume the presence of \"\n",
+    "        \"a CUDA-compatible device and won't run correctly without one.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Types\n",
+    "\n",
+    "Warp offers a range of data types that covers the needs in common compute workflows."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Boolean\n",
+    "\n",
+    "The types `wp.bool` and `bool`, which are interchangeable, can be used to represent `True`/`False` values."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Scalars\n",
+    "\n",
+    "Signed/unsigned integer and floating-point numbers with different widths are supported.\n",
+    "\n",
+    "<table>\n",
+    "    <tr>\n",
+    "        <th></th>\n",
+    "        <th>Integer</th>\n",
+    "        <th>Floating-Point</th>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>8-bit</td>\n",
+    "        <td>wp.[u]int8</td>\n",
+    "        <td></td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>16-bit</td>\n",
+    "        <td>wp.[u]int16</td>\n",
+    "        <td>wp.float16</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>32-bit</td>\n",
+    "        <td>wp.[u]int32</td>\n",
+    "        <td>wp.float32</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>64-bit</td>\n",
+    "        <td>wp.[u]int64</td>\n",
+    "        <td>wp.float64</td>\n",
+    "    </tr>\n",
+    "</table>\n",
+    "\n",
+    "Python's `int` and `float` types can also be used in place of `wp.int32` and `wp.float32`.\n",
+    "\n",
+    "Note that typing in Warp is strict, and no integer promotion is done under the hood, so types need to be explicitly matched for operations to succeed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Operation between 32-bit integers.\n",
+    "print(\"\\nx:\")\n",
+    "x = 123 + 234\n",
+    "print(x)\n",
+    "\n",
+    "# Operation between 32-bit floating-points.\n",
+    "print(\"\\ny:\")\n",
+    "y = 1.2 + 2.3\n",
+    "print(y)\n",
+    "\n",
+    "# Operation between 8-bit integers.\n",
+    "print(\"\\nz:\")\n",
+    "z = wp.int8(1) + wp.int8(2)\n",
+    "print(z)\n",
+    "\n",
+    "# Invalid operation, both integer types must match.\n",
+    "print(\"\\nw:\")\n",
+    "try:\n",
+    "    w = wp.int8(1) + wp.int16(2)\n",
+    "    print(w)\n",
+    "except Exception:\n",
+    "    print(\"invalid operation\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Linear Algebra\n",
+    "\n",
+    "Vector, matrix, and quaternion types are also provided with the most common combination of scalar types and sizes being predefined.\n",
+    "\n",
+    "<table>\n",
+    "    <tr>\n",
+    "        <th></th>\n",
+    "        <th colspan=\"4\">Integer</th>\n",
+    "        <th colspan=\"3\">Floating-Point</th>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <th></th>\n",
+    "        <th>8-bit</th>\n",
+    "        <th>16-bit</th>\n",
+    "        <th>32-bit</th>\n",
+    "        <th>64-bit</th>\n",
+    "        <th>16-bit</th>\n",
+    "        <th>32-bit</th>\n",
+    "        <th>64-bit</th>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>2D Vector</td>\n",
+    "        <td>wp.vec2[u]b</td>\n",
+    "        <td>wp.vec2[u]s</td>\n",
+    "        <td>wp.vec2[u]i</td>\n",
+    "        <td>wp.vec2[u]l</td>\n",
+    "        <td>wp.vec2h</td>\n",
+    "        <td>wp.vec2f</td>\n",
+    "        <td>wp.vec2d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>3D Vector</td>\n",
+    "        <td>wp.vec3[u]b</td>\n",
+    "        <td>wp.vec3[u]s</td>\n",
+    "        <td>wp.vec3[u]i</td>\n",
+    "        <td>wp.vec3[u]l</td>\n",
+    "        <td>wp.vec3h</td>\n",
+    "        <td>wp.vec3f</td>\n",
+    "        <td>wp.vec3d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>4D Vector</td>\n",
+    "        <td>wp.vec4[u]b</td>\n",
+    "        <td>wp.vec4[u]s</td>\n",
+    "        <td>wp.vec4[u]i</td>\n",
+    "        <td>wp.vec4[u]l</td>\n",
+    "        <td>wp.vec4h</td>\n",
+    "        <td>wp.vec4f</td>\n",
+    "        <td>wp.vec4d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>2x2 Matrix</td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td>wp.mat22h</td>\n",
+    "        <td>wp.mat22f</td>\n",
+    "        <td>wp.mat22d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>3x3 Matrix</td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td>wp.mat33h</td>\n",
+    "        <td>wp.mat33f</td>\n",
+    "        <td>wp.mat33d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>4x4 Matrix</td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td>wp.mat44h</td>\n",
+    "        <td>wp.mat44f</td>\n",
+    "        <td>wp.mat44d</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>Quaternion</td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td>wp.quath</td>\n",
+    "        <td>wp.quatf</td>\n",
+    "        <td>wp.quatd</td>\n",
+    "    </tr>\n",
+    "    <tr>\n",
+    "        <td>Transformation</td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td></td>\n",
+    "        <td>wp.transformh</td>\n",
+    "        <td>wp.transformf</td>\n",
+    "        <td>wp.transformd</td>\n",
+    "    </tr>\n",
+    "</table>\n",
+    "\n",
+    "The transformation types, as defined by Warp, define a translation part `pos` and a rotation `rot`, and are primarily intended to be used in the context of rigid bodies.\n",
+    "\n",
+    "A few aliases defaulting to 32-bit floating-points are also available as `wp.vec2`, `wp.vec3`, `wp.vec4`, `wp.mat22`, `wp.mat33`, `wp.mat44`, `wp.quat`, and `wp.transform`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rotate and scale a position vector.\n",
+    "print(\"\\nnew_pos:\")\n",
+    "pos = wp.vec3(1.0, 2.0, 3.0)\n",
+    "rot = wp.mat33(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)\n",
+    "scale = 0.5\n",
+    "new_pos = (pos * rot) * scale\n",
+    "print(new_pos)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Custom Linear Algebra Types\n",
+    "\n",
+    "It is possible to create linear algebra types of other sizes using the functions `wp.vec(length, dtype)` and `wp.mat(shape, dtype)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a 5D vector of 32-bit floating-points.\n",
+    "print(\"\\nv:\")\n",
+    "vec5f = wp.vec(length=5, dtype=wp.float32)\n",
+    "v = vec5f(1.0, 2.0, 3.0, 4.0, 5.0)\n",
+    "print(v)\n",
+    "\n",
+    "# Create a 2x3 matrix of 32-bit floating-points.\n",
+    "print(\"\\nm:\")\n",
+    "mat23f = wp.mat(shape=(2, 3), dtype=wp.float32)\n",
+    "m = mat23f(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)\n",
+    "print(m)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Arrays\n",
+    "\n",
+    "Arrays are multidimensional containers of fixed size that can store homogeneous elements of any Warp data type either on CPU or GPU memory.\n",
+    "\n",
+    "They are designed to seamlessly interop with arrays from other frameworks, such as [NumPy](https://numpy.org), [PyTorch](https://pytorch.org), [JAX](https://jax.readthedocs.io), and others.\n",
+    "\n",
+    "A gotcha of supporting both CPU and GPU data within a unified interface is that accessing individual elements directly from Python's runtime isn't exposed, since this would otherwise encourage suboptimal performance, as explained in this [FAQ entry](https://nvidia.github.io/warp/faq.html#why-aren-t-assignments-to-warp-arrays-supported-outside-of-kernels)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Arrays can be initialized from multidimensional sequences of scalar data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a 1D array of integers.\n",
+    "print(\"\\narr_int:\")\n",
+    "arr_int = wp.array((1, 2, 3), dtype=int)\n",
+    "print(f\"dtype={arr_int.dtype}, shape={arr_int.shape}\")\n",
+    "print(arr_int)\n",
+    "\n",
+    "# Create a 1D array of vectors.\n",
+    "print(\"\\narr_vec:\")\n",
+    "arr_vec = wp.array(((1, 2, 3), (4, 5, 6)), dtype=wp.vec3)\n",
+    "print(f\"dtype={arr_vec.dtype}, shape={arr_vec.shape}\")\n",
+    "print(arr_vec)\n",
+    "\n",
+    "# Create a 2D array of floating-points.\n",
+    "print(\"\\narr_2d:\")\n",
+    "arr_2d = wp.array(((1, 2, 3), (4, 5, 6)), dtype=float)\n",
+    "print(f\"dtype={arr_2d.dtype}, shape={arr_2d.shape}\")\n",
+    "print(arr_2d)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A few utilities allow initializing arrays with a given value, or skipping initialization altogether."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create an array filled with zeros.\n",
+    "print(\"\\narr_zeros:\")\n",
+    "arr_zeros = wp.zeros(3)\n",
+    "print(f\"dtype={arr_zeros.dtype}, shape={arr_zeros.shape}\")\n",
+    "print(arr_zeros)\n",
+    "\n",
+    "# Create an array filled with ones.\n",
+    "print(\"\\narr_ones:\")\n",
+    "arr_ones = wp.ones(3)\n",
+    "print(f\"dtype={arr_ones.dtype}, shape={arr_ones.shape}\")\n",
+    "print(arr_ones)\n",
+    "\n",
+    "# Create an uninitialized array.\n",
+    "print(\"\\narr_empty:\")\n",
+    "arr_empty = wp.empty(3)\n",
+    "print(f\"dtype={arr_empty.dtype}, shape={arr_empty.shape}\")\n",
+    "print(arr_empty)\n",
+    "\n",
+    "# Create an array filled with a custom value.\n",
+    "print(\"\\narr_custom:\")\n",
+    "arr_custom = wp.full(3, 123)\n",
+    "print(f\"dtype={arr_custom.dtype}, shape={arr_custom.shape}\")\n",
+    "print(arr_custom)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Initializing arrays from NumPy objects, or other frameworks like Torch, is also supported."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize an array from NumPy.\n",
+    "print(\"\\narr_from_np:\")\n",
+    "rng = np.random.default_rng(seed=123)\n",
+    "arr_np = rng.standard_normal((4, 2)).astype(np.float16)\n",
+    "arr_from_np = wp.from_numpy(arr_np)\n",
+    "print(f\"dtype={arr_from_np.dtype}, {arr_from_np.shape}\")\n",
+    "print(arr_from_np)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Structs\n",
+    "\n",
+    "When composite types are desired, it's possible to define Python classes decorated with `@wp.struct`, where each field is a class member that must be annotated with a Warp data type.\n",
+    "\n",
+    "Structs, like every other data type, are supported by arrays."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a new data type made of 2 fields.\n",
+    "@wp.struct\n",
+    "class Obstacle:\n",
+    "    pos: wp.vec3\n",
+    "    radius: float\n",
+    "\n",
+    "\n",
+    "# Create a first instance.\n",
+    "print(\"\\no1:\")\n",
+    "o1 = Obstacle()\n",
+    "o1.pos = wp.vec3(1.0, 2.0, 3.0)\n",
+    "o1.radius = 0.75\n",
+    "print(o1)\n",
+    "\n",
+    "# Create a second instance.\n",
+    "print(\"\\no2:\")\n",
+    "o2 = Obstacle()\n",
+    "o2.pos = wp.vec3(2.0, 3.0, 4.0)\n",
+    "o2.radius = 0.5\n",
+    "print(o2)\n",
+    "\n",
+    "# Create an array with these instances.\n",
+    "print(\"\\narr_struct:\")\n",
+    "arr_struct = wp.array((o1, o2), dtype=Obstacle)\n",
+    "print(f\"dtype={arr_struct.dtype}, shape={arr_struct.shape}\")\n",
+    "print(arr_struct)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Kernels\n",
+    "\n",
+    "In a typical Warp program, Python's runtime is used to allocate data and orchestrate operations, whereas the computationally intensive tasks are expected to be implemented as kernels.\n",
+    "\n",
+    "These kernels are functions decorated with `@wp.kernel`, however one notable difference with the usual Python functions is that they don't return values—all inputs and outputs must be defined as parameters with typed annotations, and all output parameters must be arrays.\n",
+    "\n",
+    "Passing data to these kernels and evaluating them on the desired device (CPU or GPU) is done with the `wp.launch()` function.\n",
+    "\n",
+    "Additionally, `wp.launch()` expects a `dim` argument that allows executing the same kernel many times in parallel, using threads, which is how the massively parallel architecture of modern GPUs and its associated performance boost can be leveraged.\n",
+    "\n",
+    "The `dim` argument expects either a single integer or a tuple with up to 4 values for multidimensional launches. To know which thread ID is currently being evaluated, we can call `wp.tid()` from within the kernel, which accordingly returns either a single value, or multiple ones."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that performs a component-wise average of two arrays.\n",
+    "@wp.kernel\n",
+    "def avg_kernel(\n",
+    "    a: wp.array(dtype=float),\n",
+    "    b: wp.array(dtype=float),\n",
+    "    out_avg: wp.array(dtype=float),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    out_avg[i] = (a[i] + b[i]) * 0.5\n",
+    "\n",
+    "\n",
+    "# Initialize the arrays to operate on and the output one storing their average.\n",
+    "shape = (32,)\n",
+    "rng = np.random.default_rng(seed=123)\n",
+    "a = wp.array(rng.standard_normal(shape).astype(np.float32))\n",
+    "b = wp.array(rng.standard_normal(shape).astype(np.float32))\n",
+    "out_avg = wp.empty_like(a)\n",
+    "\n",
+    "# Launch the kernel.\n",
+    "print(\"\\navg:\")\n",
+    "wp.launch(avg_kernel, dim=shape, inputs=(a, b), outputs=(out_avg,))\n",
+    "print(out_avg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Devices\n",
+    "\n",
+    "We mentioned earlier that arrays can live either on CPU or GPU memory and, similarly, that kernels can be evaluated on either device, but we didn't mention how to specify that.\n",
+    "\n",
+    "Arrays as well as many other functions from the API come with a `device` parameter that can either be left at the default value of `None` or set to a value representing the target device. When set to `None`, the current default device is used; otherwise, `\"cpu\"` or `\"cuda\"` can be passed to pick either CPU or GPU memory. In the case of configurations with multiple GPUs, it's also possible to specify the device index, such as `\"cuda:0\"`, `\"cuda:1\"`, and so on."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that fills an array with range values.\n",
+    "@wp.kernel\n",
+    "def range_fill_kernel(\n",
+    "    out: wp.array(dtype=int),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    out[i] = i\n",
+    "\n",
+    "\n",
+    "# Retrieve the current default device.\n",
+    "print(\"\\ncurrent_device:\")\n",
+    "current_device = wp.get_device()\n",
+    "print(current_device)\n",
+    "\n",
+    "# Fill an array on the current default device.\n",
+    "print(\"\\narr:\")\n",
+    "arr = wp.zeros(3, dtype=int)\n",
+    "wp.launch(range_fill_kernel, dim=arr.shape, outputs=(arr,))\n",
+    "print(f\"device={arr.device}\")\n",
+    "print(arr)\n",
+    "\n",
+    "# Fill an array on a specified device.\n",
+    "print(f\"\\narr_explicit:\")\n",
+    "device = \"cpu\"\n",
+    "arr_explicit = wp.zeros(3, dtype=int, device=device)\n",
+    "wp.launch(range_fill_kernel, dim=arr_explicit.shape, outputs=(arr_explicit,), device=device)\n",
+    "print(f\"device={arr_explicit.device}\")\n",
+    "print(arr_explicit)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In applications where all compute is intended to be run on the same device, it is recommended to not pass any `device` argument for individual API calls and, instead, wrap all code within a `wp.ScopedDevice()` context that sets the default device for all the API calls within that scope."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that fills an array with a fibonacci sequence.\n",
+    "@wp.kernel\n",
+    "def fibonacci_fill_kernel(\n",
+    "    out: wp.array(dtype=int),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    sqrt_5 = wp.sqrt(5.0)\n",
+    "    p = (1.0 + sqrt_5) / 2.0\n",
+    "    q = 1.0 / p\n",
+    "    out[i] = int((p ** float(i) + q ** float(i)) / sqrt_5 + 0.5)\n",
+    "\n",
+    "\n",
+    "# Ensure that all nested code is set to operate on a specified device.\n",
+    "device = \"cuda\"\n",
+    "with wp.ScopedDevice(device):\n",
+    "    print(f\"\\narr_scoped:\")\n",
+    "    arr_scoped = wp.zeros(8, dtype=int)\n",
+    "    wp.launch(fibonacci_fill_kernel, dim=arr_scoped.shape, outputs=(arr_scoped,))\n",
+    "    print(f\"device={arr_scoped.device}\")\n",
+    "    print(arr_scoped)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Transferring data between CPU and GPU memory is made easy across the API. For example, initializing an array on the GPU from an array on the CPU is handled seamlessly. More explicit functions are also exposed, such as `wp.copy()`, `wp.clone()`, or `wp.array.numpy()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clone a CPU array onto GPU memory.\n",
+    "print(\"\\narr_clone_gpu:\")\n",
+    "arr_clone_cpu = wp.array((1, 2, 3), dtype=int, device=\"cpu\")\n",
+    "arr_clone_gpu = wp.clone(arr_clone_cpu, device=\"cuda\")\n",
+    "print(f\"device={arr_clone_gpu.device}\")\n",
+    "print(arr_clone_gpu)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Built-Ins\n",
+    "\n",
+    "Similarly to Python's built-in functions, Warp ships with a set of functions that aims to cover the most common operations in areas such as scalar math (e.g.: `wp.min()`, `wp.abs()`, ...), vector math (e.g.: `wp.dot()`, `wp.length()`, ...), quaternion math (e.g.: `wp.quat_from_axis_angle()`, `wp.quat_rotate()`, ...), random numbers (e.g.: `wp.noise()`, `wp.sample_unit_sphere()`, ...), and others.\n",
+    "\n",
+    "Some math functions like `math.cos()` and `math.sin()` are available as part of Python's standard `math` module, however only their Warp counterpart such as `wp.cos()` and `wp.sin()` can be used within Warp kernels.\n",
+    "\n",
+    "All of these built-ins are available from kernels but, where possible, they can also be called directly from Python's runtime.\n",
+    "\n",
+    "The full list of built-ins is available in the documentation: https://nvidia.github.io/warp/modules/functions.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that computes the sine of each element from an array.\n",
+    "@wp.kernel\n",
+    "def sine_kernel(\n",
+    "    values: wp.array(dtype=float),\n",
+    "    out_sine: wp.array(dtype=float),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    out_sine[i] = wp.sin(values[i])\n",
+    "\n",
+    "\n",
+    "# Launch the sine kernel, once for each element.\n",
+    "print(\"\\nsine (kernel):\")\n",
+    "values = wp.array((1.0, 2.0, 3.0), dtype=float)\n",
+    "out_sine = wp.empty_like(values)\n",
+    "wp.launch(sine_kernel, dim=out_sine.shape, inputs=(values,), outputs=(out_sine,))\n",
+    "print(out_sine)\n",
+    "\n",
+    "# Try the same `wp.sin()` built-in from Python.\n",
+    "print(\"\\nsine (runtime):\")\n",
+    "x = wp.sin(1.0)\n",
+    "y = wp.sin(2.0)\n",
+    "z = wp.sin(3.0)\n",
+    "print(x, y, z)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Random Numbers\n",
+    "\n",
+    "Random numbers are made available from within Warp kernels using the `wp.rand_init()` built-in to initialize the state of the generator, followed by any of `wp.randf()`, `wp.randi()`, or `wp.randn()` calls."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that generates random numbers.\n",
+    "@wp.kernel\n",
+    "def rand_kernel(\n",
+    "    seed: int,\n",
+    "    out_rand: wp.array(dtype=float),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    rng = wp.rand_init(seed, i)\n",
+    "    out_rand[i] = wp.randf(rng)\n",
+    "\n",
+    "\n",
+    "# Launch the rand kernel.\n",
+    "print(\"\\nrand:\")\n",
+    "out_rand = wp.empty(3, dtype=float)\n",
+    "wp.launch(rand_kernel, dim=out_rand.shape, inputs=(123,), outputs=(out_rand,))\n",
+    "print(out_rand)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Geometric sampling is available through built-ins like `wp.sample_triangle()`, `wp.sample_unit_disk()`, `wp.sample_unit_sphere()`, `wp.sample_unit_cube()`, and others."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that samples random points within a unit hemisphere.\n",
+    "@wp.kernel\n",
+    "def sample_unit_hemisphere_kernel(\n",
+    "    seed: int,\n",
+    "    out_sample: wp.array(dtype=wp.vec3),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    rng = wp.rand_init(seed, i)\n",
+    "    out_sample[i] = wp.sample_unit_hemisphere(rng)\n",
+    "\n",
+    "\n",
+    "# Launch the sampling kernel.\n",
+    "print(\"\\nsample:\")\n",
+    "out_sample = wp.empty(3, dtype=wp.vec3)\n",
+    "wp.launch(sample_unit_hemisphere_kernel, dim=out_sample.shape, inputs=(123,), outputs=(out_sample,))\n",
+    "print(out_sample)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, Perlin-based noise functions are exposed using `wp.noise()`, `wp.pnoise()`, and `wp.curlnoise()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that outputs a curl noise for a 2D value.\n",
+    "@wp.kernel\n",
+    "def noise_kernel(\n",
+    "    seed: int,\n",
+    "    out_noise: wp.array(dtype=wp.vec2),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    rng = wp.rand_init(seed, i)\n",
+    "    xy = wp.vec2(float(123 + i * 2), float(234 + i * 3))\n",
+    "    out_noise[i] = wp.curlnoise(rng, xy)\n",
+    "\n",
+    "\n",
+    "# Launch the noise kernel.\n",
+    "print(\"\\nnoise:\")\n",
+    "out_noise = wp.empty(3, dtype=wp.vec2)\n",
+    "wp.launch(noise_kernel, dim=out_noise.shape, inputs=(12,), outputs=(out_noise,))\n",
+    "print(out_noise)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## User Functions\n",
+    "\n",
+    "For a function to be available in kernels, it needs to be decorated with `@wp.func`. However, unlike with kernels, these functions cannot be passed to `wp.launch()` directly, instead they are meant to be called either by a kernel or by another Warp function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a function that computes the component-wise product of a 2D vector.\n",
+    "# Providing the return type hint is optional\n",
+    "@wp.func\n",
+    "def product(\n",
+    "    v: wp.vec2,\n",
+    ") -> float:\n",
+    "    return v[0] * v[1]\n",
+    "\n",
+    "\n",
+    "# Define a kernel that computes the component-wise product of 2 vectors.\n",
+    "@wp.kernel\n",
+    "def product_kernel(\n",
+    "    v1: wp.vec2,\n",
+    "    v2: wp.vec2,\n",
+    "    out_product: wp.array(dtype=float),\n",
+    "):\n",
+    "    out_product[0] = product(v1) * product(v2)\n",
+    "\n",
+    "\n",
+    "# Launch the product kernel once.\n",
+    "print(\"\\nproduct:\")\n",
+    "v1 = wp.vec2(2.0, 4.0)\n",
+    "v2 = wp.vec2(3.0, 5.0)\n",
+    "out_product = wp.empty(1, dtype=float)\n",
+    "wp.launch(product_kernel, dim=1, inputs=(v1, v2), outputs=(out_product,))\n",
+    "print(out_product)"
+   ]
+  }
+ ],
+ "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/notebooks/core_02_generics.ipynb b/notebooks/core_02_generics.ipynb
new file mode 100644
index 00000000..b7020ca5
--- /dev/null
+++ b/notebooks/core_02_generics.ipynb
@@ -0,0 +1,403 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Warp Core Tutorial: Generics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install warp-lang"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warp as wp\n",
+    "\n",
+    "wp.config.quiet = True\n",
+    "\n",
+    "# Explicitly initializing Warp is not necessary but\n",
+    "# we do it here to ensure everything is good to go.\n",
+    "wp.init()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Function Overloading\n",
+    "\n",
+    "Warp allows defining multiple functions with the same name that have a different parameter signature."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.func\n",
+    "def product(\n",
+    "    v: wp.vec2,\n",
+    ") -> float:\n",
+    "    return v[0] * v[1]\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def product(\n",
+    "    m: wp.mat22,\n",
+    ") -> float:\n",
+    "    return m[0, 0] * m[0, 1] * m[1, 0] * m[1, 1]\n",
+    "\n",
+    "\n",
+    "# Define a kernel that multiplies together all the components\n",
+    "# of a vector and a matrix.\n",
+    "@wp.kernel\n",
+    "def product_kernel(\n",
+    "    v: wp.vec2,\n",
+    "    m: wp.mat22,\n",
+    "    out_product: wp.array(dtype=float),\n",
+    "):\n",
+    "    out_product[0] = product(v) * product(m)\n",
+    "\n",
+    "\n",
+    "print(\"\\nproduct:\")\n",
+    "v = wp.vec2(2.0, 4.0)\n",
+    "m = wp.mat22(3.0, 5.0, 7.0, 9.0)\n",
+    "out_product = wp.empty(1, dtype=float)\n",
+    "wp.launch(product_kernel, dim=1, inputs=(v, m), outputs=(out_product,))\n",
+    "print(out_product)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generic Functions\n",
+    "\n",
+    "A complementary approach to overloading functions is to use one of the generic types `typing.Any`, `wp.Int`, `wp.Float`, or `wp.Scalar`, and let Warp infer the final function's signature based on the arguments being passed to it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This function works with integer and floating-point types of any width.\n",
+    "@wp.func\n",
+    "def square(\n",
+    "    x: wp.Scalar,\n",
+    ") -> wp.Scalar:\n",
+    "    return x * x\n",
+    "\n",
+    "\n",
+    "# Define two kernels that square the values of an array,\n",
+    "# one for 16-bit integers, and another one for 64-bit floating-points.\n",
+    "@wp.kernel\n",
+    "def square_kernel_i16(\n",
+    "    arr: wp.array(dtype=wp.int16),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    arr[i] = square(arr[i])\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def square_kernel_f64(\n",
+    "    arr: wp.array(dtype=wp.float64),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    arr[i] = square(arr[i])\n",
+    "\n",
+    "\n",
+    "# First implicit instantiation of `square()` with a 16-bit integer type.\n",
+    "print(\"\\narr_i16:\")\n",
+    "arr_i16 = wp.array((1, 2, 3), dtype=wp.int16)\n",
+    "wp.launch(square_kernel_i16, dim=arr_i16.shape, inputs=(arr_i16,))\n",
+    "print(arr_i16)\n",
+    "\n",
+    "# Second implicit instantiation of `square()` with a 64-bit floating-point type.\n",
+    "print(\"\\narr_f64:\")\n",
+    "arr_f64 = wp.array((4, 5, 6), dtype=wp.float64)\n",
+    "wp.launch(square_kernel_f64, dim=arr_f64.shape, inputs=(arr_f64,))\n",
+    "print(arr_f64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generic Kernels\n",
+    "\n",
+    "The same generic types `typing.Any`, `wp.Int`, `wp.Float`, and `wp.Scalar` can also be used when annotating parameters on a kernel.\n",
+    "\n",
+    "To generate the final kernels from such generic types, Warp supports implicit and explicit instantiations."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Implicit Instantiation\n",
+    "\n",
+    "By default, Warp infers the final kernel's signature and implementation based on the arguments being passed to it when calling `wp.launch()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that scales the values of an array with a coefficient.\n",
+    "# Its elements can be integers or floating-points of any width.\n",
+    "@wp.kernel\n",
+    "def scale_kernel(\n",
+    "    arr: wp.array(dtype=wp.Scalar),\n",
+    "    coeff: wp.Scalar,\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    arr[i] *= coeff\n",
+    "\n",
+    "\n",
+    "# First implicit kernel instantiation with a 16-bit integer type.\n",
+    "print(\"\\narr_i16:\")\n",
+    "arr_i16 = wp.array((1, 2, 3), dtype=wp.int16)\n",
+    "wp.launch(scale_kernel, dim=arr_i16.shape, inputs=(arr_i16, wp.int16(2)))\n",
+    "print(arr_i16)\n",
+    "\n",
+    "# Second implicit kernel instantiation with a 64-bit floating-point type.\n",
+    "print(\"\\narr_f64:\")\n",
+    "arr_f64 = wp.array((4, 5, 6), dtype=wp.float64)\n",
+    "wp.launch(scale_kernel, dim=arr_f64.shape, inputs=(arr_f64, wp.float64(2)))\n",
+    "print(arr_f64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Explicit Instantiation\n",
+    "\n",
+    "It's also possible to specify which types a kernel should be instantiated against, before even needing to call `wp.launch()`. This is done using the `@wp.overload` decorator.\n",
+    "\n",
+    "One advantage of this approach is that it speeds up kernel launches since Warp won't need to try inferring and generating a new kernel instance each time. Another is related to module reloading, as detailed in the [documentation here](https://nvidia.github.io/warp/modules/generics.html#module-reloading-behavior)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that scales the values of an array with a coefficient.\n",
+    "# Its elements can be integers or floating-points of any width.\n",
+    "@wp.kernel\n",
+    "def scale_kernel(\n",
+    "    arr: wp.array(dtype=wp.Scalar),\n",
+    "    coeff: wp.Scalar,\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "    arr[i] *= coeff\n",
+    "\n",
+    "\n",
+    "# Explicit instantiation for 16-bit integers.\n",
+    "@wp.overload\n",
+    "def scale_kernel(\n",
+    "    arr: wp.array(dtype=wp.int16),\n",
+    "    coeff: wp.int16,\n",
+    "): ...\n",
+    "\n",
+    "\n",
+    "# Explicit instantiation for 64-bit floating-points.\n",
+    "@wp.overload\n",
+    "def scale_kernel(\n",
+    "    arr: wp.array(dtype=wp.float64),\n",
+    "    coeff: wp.float64,\n",
+    "): ...\n",
+    "\n",
+    "\n",
+    "# Launch the kernel instance using a 16-bit integer type.\n",
+    "print(\"\\narr_i16:\")\n",
+    "arr_i16 = wp.array((1, 2, 3), dtype=wp.int16)\n",
+    "wp.launch(scale_kernel, dim=arr_i16.shape, inputs=(arr_i16, wp.int16(2)))\n",
+    "print(arr_i16)\n",
+    "\n",
+    "# Launch the kernel instance using a 64-bit floating-point type.\n",
+    "print(\"\\narr_f64:\")\n",
+    "arr_f64 = wp.array((4, 5, 6), dtype=wp.float64)\n",
+    "wp.launch(scale_kernel, dim=arr_f64.shape, inputs=(arr_f64, wp.float64(2)))\n",
+    "print(arr_f64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Type Introspection\n",
+    "\n",
+    "Due to Warp's strict typing rules and lack of integer/floating-point promotion rules, it is required to pass the exact argument types when calling functions. For example, when constructing a `wp.vec3s()` instance, it is necessary to ensure that each argument is explicitly cast to the type `wp.int16`, if it isn't of that type already, like `wp.vec3s(wp.int16(1), wp.int16(2), wp.int16(3))`, since integer literals default to 32-bit.\n",
+    "\n",
+    "In the context of a generic kernel/function where the parameter type is only known at runtime, Warp exposes a `type()` operator that allows retrieving the resolved type of a variable in order to initialize/cast values.\n",
+    "\n",
+    "To retrieve the data type of the elements of an array, calling `type()` on the first element can be used, but a more convenient form is also available with `array.dtype`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a kernel that increases the values of an array by a fixed amount.\n",
+    "@wp.kernel\n",
+    "def increase_kernel(\n",
+    "    arr: wp.array(dtype=wp.Scalar),\n",
+    "):\n",
+    "    i = wp.tid()\n",
+    "\n",
+    "    # These 2 ways of retrieving the array's data type are equivalent.\n",
+    "    arr[i] += type(arr[0])(2)\n",
+    "    arr[i] += arr.dtype(3)\n",
+    "\n",
+    "\n",
+    "# Launch the kernel instance using a 16-bit integer type.\n",
+    "print(\"\\narr_i16:\")\n",
+    "arr_i16 = wp.array((1, 2, 3), dtype=wp.int16)\n",
+    "wp.launch(increase_kernel, dim=arr_i16.shape, inputs=(arr_i16,))\n",
+    "print(arr_i16)\n",
+    "\n",
+    "# Launch the kernel instance using a 64-bit floating-point type.\n",
+    "print(\"\\narr_f64:\")\n",
+    "arr_f64 = wp.array((4, 5, 6), dtype=wp.float64)\n",
+    "wp.launch(increase_kernel, dim=arr_f64.shape, inputs=(arr_f64,))\n",
+    "print(arr_f64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dynamic Code Generation\n",
+    "\n",
+    "When more flexibility is desired than what the approaches covered so far can offer, we can make use of the dynamic nature of Python to generate kernels, functions, and even structs at runtime using closures that define values, types, or even functions as parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define some operator functions that we can pass to the kernel as arguments.\n",
+    "\n",
+    "@wp.func\n",
+    "def op_add(\n",
+    "    a: wp.Scalar,\n",
+    "    b: wp.Scalar,\n",
+    ") -> wp.Scalar:\n",
+    "    return a + b\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def op_mul(\n",
+    "    a: wp.Scalar,\n",
+    "    b: wp.Scalar,\n",
+    ") -> wp.Scalar:\n",
+    "    return a * b\n",
+    "\n",
+    "\n",
+    "# Closure creating and returning a kernel.\n",
+    "# All the argument values will be embedded into the generated code\n",
+    "# that is to be compiled against the target architecture (CUDA or C++).\n",
+    "def create_kernel(\n",
+    "    vec_length: int,\n",
+    "    vec_dtype: wp.Scalar,\n",
+    "    num_iter: int,\n",
+    "    op_fn: wp.Function,\n",
+    ") -> wp.kernel:\n",
+    "    # Define the vector type from its length/dtype.\n",
+    "    vec = wp.vec(vec_length, vec_dtype)\n",
+    "\n",
+    "    # Define a function that reduces all of a vector's components into a single\n",
+    "    # value, using the provided operator function.\n",
+    "    @wp.func\n",
+    "    def reduce(\n",
+    "        v: vec,\n",
+    "    ) -> vec_dtype:\n",
+    "        out = vec_dtype(0)\n",
+    "        for i in range(vec_length):\n",
+    "            out += op_fn(v[i], vec_dtype(i))\n",
+    "\n",
+    "        return out\n",
+    "\n",
+    "    # Define the kernel function to return.\n",
+    "    @wp.kernel\n",
+    "    def kernel(\n",
+    "        arr: wp.array(dtype=vec),\n",
+    "    ):\n",
+    "        tid = wp.tid()\n",
+    "\n",
+    "        v = vec()\n",
+    "        for i in range(vec_length):\n",
+    "            v[i] = vec_dtype(tid + i)\n",
+    "\n",
+    "        for _ in range(num_iter):\n",
+    "            v *= reduce(v)\n",
+    "\n",
+    "        arr[tid] = v\n",
+    "\n",
+    "    return kernel\n",
+    "\n",
+    "\n",
+    "# Generate and evaluate a first kernel.\n",
+    "print(\"\\narr_1:\")\n",
+    "vec_length = 2\n",
+    "vec_dtype = wp.int32\n",
+    "num_iter = 3\n",
+    "op_fn = op_mul\n",
+    "arr_1 = wp.empty(3, dtype=wp.vec(vec_length, vec_dtype))\n",
+    "kernel_1 = create_kernel(vec_length, vec_dtype, num_iter, op_fn)\n",
+    "wp.launch(kernel_1, dim=arr_1.shape, inputs=(arr_1,))\n",
+    "print(arr_1)\n",
+    "\n",
+    "# Generate and evaluate a second kernel.\n",
+    "print(\"\\narr_2:\")\n",
+    "vec_length = 3\n",
+    "vec_dtype = wp.float64\n",
+    "num_iter = 2\n",
+    "op_fn = op_add\n",
+    "arr_2 = wp.empty(3, dtype=wp.vec(vec_length, vec_dtype))\n",
+    "kernel_2 = create_kernel(vec_length, vec_dtype, num_iter, op_fn)\n",
+    "wp.launch(kernel_2, dim=arr_2.shape, inputs=(arr_2,))\n",
+    "print(arr_2)"
+   ]
+  }
+ ],
+ "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/notebooks/core_03_points.ipynb b/notebooks/core_03_points.ipynb
new file mode 100644
index 00000000..d1ba186a
--- /dev/null
+++ b/notebooks/core_03_points.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Warp Core Tutorial: Points"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install warp-lang matplotlib pyglet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import warp as wp\n",
+    "\n",
+    "wp.config.quiet = True\n",
+    "\n",
+    "# Explicitly initializing Warp is not necessary but\n",
+    "# we do it here to ensure everything is good to go.\n",
+    "wp.init()\n",
+    "\n",
+    "# Everything else is solely to visualize the results.\n",
+    "import IPython\n",
+    "import matplotlib\n",
+    "import matplotlib.animation\n",
+    "import matplotlib.pyplot\n",
+    "import pyglet\n",
+    "\n",
+    "import warp.render\n",
+    "\n",
+    "matplotlib.pyplot.rc(\"animation\", html=\"jshtml\")\n",
+    "pyglet.options[\"headless\"] = True\n",
+    "\n",
+    "import sys\n",
+    "if \"google.colab\" in sys.modules:\n",
+    "    print(\n",
+    "        \"Rendering OpenGL instances with `glDrawElementsInstanced` \"\n",
+    "        \"is *extremely* slow in Google Colab, so it may take a few minutes \"\n",
+    "        \"to render the snippets from this notebook.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Point Queries\n",
+    "\n",
+    "When dealing with collections of points, a common operation is to query the neighbouring points to a given position, which can quickly become expensive as the number of points grows.\n",
+    "\n",
+    "To speed up these queries, Warp exposes a hash grid spatial structure that can be initialized using `wp.HashGrid()` and then rebuilt at each step based on the current point positions with a call to the `wp.HashGrid.build()` method.\n",
+    "\n",
+    "Instances of `wp.HashGrid` can then be passed to kernels using a `wp.uint64` integer representing their unique IDs, and the built-in `wp.hash_grid_query()` can be called there to iterate over the closest points."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Compute\"\"\"\n",
+    "\n",
+    "@wp.func\n",
+    "def contact_force(\n",
+    "    n: wp.vec3,\n",
+    "    v: wp.vec3,\n",
+    "    c: float,\n",
+    "    k_n: float,\n",
+    "    k_d: float,\n",
+    "    k_f: float,\n",
+    "    k_mu: float,\n",
+    ") -> wp.vec3:\n",
+    "    vn = wp.dot(n, v)\n",
+    "    jn = c * k_n\n",
+    "    jd = min(vn, 0.0) * k_d\n",
+    "\n",
+    "    # contact force\n",
+    "    fn = jn + jd\n",
+    "\n",
+    "    # friction force\n",
+    "    vt = v - n * vn\n",
+    "    vs = wp.length(vt)\n",
+    "\n",
+    "    if vs > 0.0:\n",
+    "        vt = vt / vs\n",
+    "\n",
+    "    # Coulomb condition\n",
+    "    ft = wp.min(vs * k_f, k_mu * wp.abs(fn))\n",
+    "\n",
+    "    # total force\n",
+    "    return -n * fn - vt * ft\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def update(\n",
+    "    grid: wp.uint64,\n",
+    "    particle_x: wp.array(dtype=wp.vec3),\n",
+    "    particle_v: wp.array(dtype=wp.vec3),\n",
+    "    particle_f: wp.array(dtype=wp.vec3),\n",
+    "    radius: float,\n",
+    "    k_contact: float,\n",
+    "    k_damp: float,\n",
+    "    k_friction: float,\n",
+    "    k_mu: float,\n",
+    "):\n",
+    "    tid = wp.tid()\n",
+    "\n",
+    "    # order threads by cell\n",
+    "    i = wp.hash_grid_point_id(grid, tid)\n",
+    "\n",
+    "    x = particle_x[i]\n",
+    "    v = particle_v[i]\n",
+    "\n",
+    "    f = wp.vec3()\n",
+    "\n",
+    "    # ground contact\n",
+    "    n = wp.vec3(0.0, 1.0, 0.0)\n",
+    "    c = wp.dot(n, x)\n",
+    "\n",
+    "    cohesion_ground = 0.02\n",
+    "    cohesion_particle = 0.0075\n",
+    "\n",
+    "    if c < cohesion_ground:\n",
+    "        f = f + contact_force(n, v, c, k_contact, k_damp, 100.0, 0.5)\n",
+    "\n",
+    "    # particle contact\n",
+    "    neighbors = wp.hash_grid_query(grid, x, radius * 5.0)\n",
+    "\n",
+    "    for index in neighbors:\n",
+    "        if index != i:\n",
+    "            # compute distance to point\n",
+    "            n = x - particle_x[index]\n",
+    "            d = wp.length(n)\n",
+    "            err = d - radius * 2.0\n",
+    "\n",
+    "            if err <= cohesion_particle:\n",
+    "                n = n / d\n",
+    "                vrel = v - particle_v[index]\n",
+    "\n",
+    "                f = f + contact_force(n, vrel, err, k_contact, k_damp, k_friction, k_mu)\n",
+    "\n",
+    "    particle_f[i] = f\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def integrate(\n",
+    "    x: wp.array(dtype=wp.vec3),\n",
+    "    v: wp.array(dtype=wp.vec3),\n",
+    "    f: wp.array(dtype=wp.vec3),\n",
+    "    gravity: wp.vec3,\n",
+    "    dt: float,\n",
+    "    inv_mass: float,\n",
+    "):\n",
+    "    tid = wp.tid()\n",
+    "\n",
+    "    v_new = v[tid] + f[tid] * inv_mass * dt + gravity * dt\n",
+    "    x_new = x[tid] + v_new * dt\n",
+    "\n",
+    "    v[tid] = v_new\n",
+    "    x[tid] = x_new\n",
+    "\n",
+    "\n",
+    "\"\"\"Initialization\"\"\"\n",
+    "\n",
+    "def create_particle_grid(\n",
+    "    dim_x: int,\n",
+    "    dim_y: int,\n",
+    "    dim_z: int,\n",
+    "    lower: float,\n",
+    "    radius: float,\n",
+    "    jitter: float,\n",
+    ") -> wp.array(dtype=wp.vec3):\n",
+    "    points = np.meshgrid(\n",
+    "        np.linspace(0, dim_x, dim_x),\n",
+    "        np.linspace(0, dim_y, dim_y),\n",
+    "        np.linspace(0, dim_z, dim_z),\n",
+    "    )\n",
+    "    points = np.array((points[0], points[1], points[2])).T * radius * 2.0 + np.array(lower)\n",
+    "    points = points + np.random.rand(*points.shape) * radius * jitter\n",
+    "    return wp.array(points.reshape((-1, 3)), dtype=wp.vec3)\n",
+    "\n",
+    "\n",
+    "# Resolution of the rendered image.\n",
+    "resolution = (512, 384)\n",
+    "\n",
+    "# Number of frames to run the sample for.\n",
+    "num_frames = 200\n",
+    "\n",
+    "# Number of frames per second.\n",
+    "fps = 60\n",
+    "\n",
+    "# Number of simulation steps per frame.\n",
+    "sim_substeps = 64\n",
+    "\n",
+    "# Time delta between two frames.\n",
+    "frame_dt = 1.0 / fps\n",
+    "\n",
+    "# Time delta between two simulation steps.\n",
+    "sim_dt = frame_dt / sim_substeps\n",
+    "\n",
+    "# Radius for each particle.\n",
+    "point_radius = 0.1\n",
+    "\n",
+    "# Inverted mass for each particle.\n",
+    "inv_mass = 64.0\n",
+    "\n",
+    "# Contact normal stiffness.\n",
+    "k_contact = 8e3\n",
+    "\n",
+    "# Contact damping stiffness.\n",
+    "k_damp = 2.0\n",
+    "\n",
+    "# Contact friction stiffness.\n",
+    "k_friction = 1.0\n",
+    "\n",
+    "# Coefficient of friction for cohesive materials.\n",
+    "k_mu = 1e5\n",
+    "\n",
+    "# Initialize the particle positions, velocities, and forces.\n",
+    "points = create_particle_grid(8, 32, 8, (0.0, 0.5, 0.0), point_radius, 0.1)\n",
+    "velocities = wp.array(((0.0, 0.0, 15.0),) * len(points), dtype=wp.vec3)\n",
+    "forces = wp.empty_like(points)\n",
+    "\n",
+    "# Initialize a hash grid.\n",
+    "grid = wp.HashGrid(128, 128, 128)\n",
+    "\n",
+    "# Cell size to use when building the grid.\n",
+    "grid_cell_size = point_radius * 5.0\n",
+    "\n",
+    "# Camera settings.\n",
+    "camera_pos = (-26.0, 6.0, 13.5)\n",
+    "camera_front = (1.0, 0.0, 0.0)\n",
+    "\n",
+    "# Create a headless OpenGL renderer for our scene.\n",
+    "renderer = warp.render.OpenGLRenderer(\n",
+    "    fps=fps,\n",
+    "    screen_width=resolution[0],\n",
+    "    screen_height=resolution[1],\n",
+    "    camera_pos=camera_pos,\n",
+    "    camera_front=camera_front,\n",
+    "    draw_grid=False,\n",
+    "    draw_axis=False,\n",
+    "    vsync=False,\n",
+    "    headless=True,\n",
+    ")\n",
+    "\n",
+    "# Buffer storing the pixels data to visualize the resulting 3D render.\n",
+    "image = wp.empty(shape=(resolution[1], resolution[0], 3), dtype=float)\n",
+    "\n",
+    "\n",
+    "\"\"\"Evaluation\"\"\"\n",
+    "\n",
+    "renders = []\n",
+    "for frame in range(num_frames):\n",
+    "    # Build the hash grid from the current point positions.\n",
+    "    grid.build(points, grid_cell_size)\n",
+    "\n",
+    "    for _ in range(sim_substeps):\n",
+    "        # Update the forces of each point.\n",
+    "        wp.launch(\n",
+    "            kernel=update,\n",
+    "            dim=points.shape,\n",
+    "            inputs=(\n",
+    "                grid.id,\n",
+    "                points,\n",
+    "                velocities,\n",
+    "                forces,\n",
+    "                point_radius,\n",
+    "                k_contact,\n",
+    "                k_damp,\n",
+    "                k_friction,\n",
+    "                k_mu,\n",
+    "            ),\n",
+    "        )\n",
+    "\n",
+    "        # Solve the point velocities and positions for the current substep.\n",
+    "        wp.launch(\n",
+    "            kernel=integrate,\n",
+    "            dim=points.shape,\n",
+    "            inputs=(\n",
+    "                points,\n",
+    "                velocities,\n",
+    "                forces,\n",
+    "                (0.0, -9.8, 0.0),\n",
+    "                sim_dt,\n",
+    "                inv_mass,\n",
+    "            ),\n",
+    "        )\n",
+    "\n",
+    "    # Use the OpenGL renderer to store an image representing the 3D scene at\n",
+    "    # the current frame.\n",
+    "    renderer.begin_frame(frame / num_frames)\n",
+    "    renderer.render_points(\n",
+    "        points=points.numpy(),\n",
+    "        radius=point_radius,\n",
+    "        name=\"points\",\n",
+    "        colors=(0.8, 0.3, 0.2),\n",
+    "    )\n",
+    "    renderer.end_frame()\n",
+    "\n",
+    "    # Store the resulting render on host memory.\n",
+    "    renderer.get_pixels(image, split_up_tiles=False, mode=\"rgb\")\n",
+    "    renders.append(wp.clone(image, device=\"cpu\", pinned=True))\n",
+    "\n",
+    "# Ensure that all the kernel launches and copies to CPU have finished.\n",
+    "wp.synchronize()\n",
+    "\n",
+    "\n",
+    "\"\"\"Visualization in Matplotlib\"\"\"\n",
+    "\n",
+    "# Set-up Matplotlib.\n",
+    "plot_fig = matplotlib.pyplot.figure(figsize=resolution, dpi=1.0)\n",
+    "plot_fig.subplots_adjust(left=0, bottom=0, right=1, top=1)\n",
+    "plot_img = matplotlib.pyplot.imshow(renders[0], animated=True)\n",
+    "plot_img.axes.set_axis_off()\n",
+    "\n",
+    "# Run Matplotlib's animation.\n",
+    "plot_anim = matplotlib.animation.FuncAnimation(\n",
+    "    plot_fig,\n",
+    "    lambda frame: plot_img.set_data(renders[frame]),\n",
+    "    frames=num_frames,\n",
+    "    interval=(1.0 / fps) * 1000.0,\n",
+    ")\n",
+    "\n",
+    "# Display the result.\n",
+    "IPython.display.display(plot_anim)\n",
+    "matplotlib.pyplot.close()"
+   ]
+  }
+ ],
+ "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/notebooks/core_04_meshes.ipynb b/notebooks/core_04_meshes.ipynb
new file mode 100644
index 00000000..6469136f
--- /dev/null
+++ b/notebooks/core_04_meshes.ipynb
@@ -0,0 +1,554 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Warp Core Tutorial: Meshes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install warp-lang matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warp as wp\n",
+    "\n",
+    "wp.config.quiet = True\n",
+    "\n",
+    "# Explicitly initializing Warp is not necessary but\n",
+    "# we do it here to ensure everything is good to go.\n",
+    "wp.init()\n",
+    "\n",
+    "# Everything else is solely to visualize the results.\n",
+    "import IPython\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot\n",
+    "\n",
+    "matplotlib.pyplot.rc(\"animation\", html=\"jshtml\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Mesh Queries\n",
+    "\n",
+    "Triangular meshes with point positions, vertex indices, and optionally velocities, can be represented in Warp with the `wp.Mesh` structure.\n",
+    "\n",
+    "The main purpose of this structure is to provide mesh queries backed by a fast BVH acceleration structure, which is well suited to collision detection, among other applications."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A built-in commonly used for collision detection, or to create Signed Distance Fields (SDFs), is `wp.mesh_query_point()`, which finds the closest location on a mesh to a given point in space.\n",
+    "\n",
+    "The data returned tells whether the point is inside or outside the mesh, and gives the index of the face containing the closest location along with its UV coordinates, which can be evaluated into a 3D position and, from there, the distance of the point to the mesh."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Triangular mesh data for a logo geometry\"\"\"\n",
+    "\n",
+    "GEOM_POINTS = (\n",
+    "    (-3.2729452  , -0.5, -0.47547257 ), (-3.0449789  , -0.5,  0.02605331 ),\n",
+    "    (-2.667206   , -0.5,  0.644819   ), (-2.2112734  , -0.5,  1.1268051  ),\n",
+    "    (-1.6381009  , -0.5,  1.5110911  ), (-1.0128219  , -0.5,  1.7455708  ),\n",
+    "    (-0.40708268 , -0.5,  1.7976774  ), ( 0.2963562  , -0.5,  1.7455708  ),\n",
+    "    ( 0.9607153  , -0.5,  1.550171   ), ( 1.559941   , -0.5,  1.289638   ),\n",
+    "    ( 2.0875204  , -0.5,  0.9900252  ), ( 2.5695062  , -0.5,  0.62527907 ),\n",
+    "    ( 3.1622186  , -0.5,  0.13677979 ), ( 4.197837   , -0.5,  0.7360056  ),\n",
+    "    ( 3.4618316  , -0.5,  1.2375314  ), ( 2.1135736  , -0.5,  1.810704   ),\n",
+    "    ( 1.2473016  , -0.5,  2.032157   ), ( 0.32892287 , -0.5,  2.2210433  ),\n",
+    "    (-0.6871557  , -0.5,  2.2666366  ), (-1.3840814  , -0.5,  2.1559103  ),\n",
+    "    (-2.0875204  , -0.5,  1.8823506  ), (-2.719313   , -0.5,  1.4850378  ),\n",
+    "    (-3.292485   , -0.5,  0.9379186  ), (-3.6702578  , -0.5,  0.44290602 ),\n",
+    "    (-4.015464   , -0.5, -0.16934645 ), (-4.197837   , -0.5, -0.5666591  ),\n",
+    "    (-4.191324   , -0.5, -0.5796858  ), (-3.7614443  , -0.5, -0.93791854 ),\n",
+    "    (-3.227352   , -0.5, -1.3156914  ), (-2.5108862  , -0.5, -1.7585975  ),\n",
+    "    (-1.9051471  , -0.5, -2.012617   ), (-1.2407882  , -0.5, -2.1819634  ),\n",
+    "    (-0.6220224  , -0.5, -2.2666368  ), (-0.02279663 , -0.5, -2.2536101  ),\n",
+    "    ( 0.6089958  , -0.5, -2.0777502  ), ( 1.1496017  , -0.5, -1.849784   ),\n",
+    "    ( 1.6836942  , -0.5, -1.550171   ), ( 2.1266003  , -0.5, -1.1984516  ),\n",
+    "    ( 2.5108862  , -0.5, -0.8532454  ), ( 2.764906   , -0.5, -0.58619916 ),\n",
+    "    ( 1.8074474  , -0.5, -0.5406059  ), ( 1.436188   , -0.5, -0.95094526 ),\n",
+    "    ( 0.934662   , -0.5, -1.3612849  ), ( 0.452676   , -0.5, -1.6022776  ),\n",
+    "    (-0.042336583, -0.5, -1.7585975  ), (-0.58945584 , -0.5, -1.8237306  ),\n",
+    "    (-1.1886816  , -0.5, -1.7585975  ), (-1.9181738  , -0.5, -1.5436577  ),\n",
+    "    (-2.4783196  , -0.5, -1.2179915  ), (-2.9016855  , -0.5, -0.8923255  ),\n",
+    "    (-2.5785387  , -0.5, -0.41622198 ), (-2.0679803  , -0.5, -0.84021866 ),\n",
+    "    (-1.5664544  , -0.5, -1.0616719  ), (-1.0453885  , -0.5, -1.2310182  ),\n",
+    "    (-0.53083587 , -0.5, -1.2961515  ), (-0.009769917, -0.5, -1.2114781  ),\n",
+    "    ( 0.37451613 , -0.5, -1.0616719  ), ( 0.7392623  , -0.5, -0.8076521  ),\n",
+    "    ( 1.0063086  , -0.5, -0.5666591  ), ( 0.23773634 , -0.5,  0.110726595),\n",
+    "    (-0.10095644 , -0.5, -0.44941938 ), (-0.51780915 , -0.5, -0.7685722  ),\n",
+    "    (-1.0584152  , -0.5, -0.801139   ), (-1.5208611  , -0.5, -0.65133226 ),\n",
+    "    (-1.8335006  , -0.5, -0.34520614 ), (-2.1656802  , -0.5, 0.40382612  ),\n",
+    "    (-1.6771808  , -0.5, 0.905352    ), (-1.1235483  , -0.5, 1.2049649   ),\n",
+    "    (-0.5112959  , -0.5, 1.3222047   ), ( 0.16608977 , -0.5, 1.2505581   ),\n",
+    "    ( 0.7978822  , -0.5, 1.0160785   ), ( 1.397108   , -0.5, 0.6383058   ),\n",
+    "    ( 1.9312005  , -0.5, 0.27355957  ), ( 2.3415399  , -0.5, -0.11723983 ),\n",
+    "    ( 1.377568   , -0.5, -0.039079905), ( 0.7718289  , -0.5,  0.4754727  ),\n",
+    "    ( 0.15957642 , -0.5,  0.8011388  ), (-0.61550903 , -0.5,  0.8988387  ),\n",
+    "    (-1.2277615  , -0.5,  0.6187657  ), (-1.559941   , -0.5,  0.25401962 ),\n",
+    "    (-2.5785387  ,  0.5, -0.41622198 ), (-2.1656802  ,  0.5,  0.40382612 ),\n",
+    "    (-1.6771808  ,  0.5,  0.905352   ), (-1.1235483  ,  0.5,  1.2049649  ),\n",
+    "    (-0.5112959  ,  0.5,  1.3222047  ), ( 0.16608977 ,  0.5,  1.2505581  ),\n",
+    "    ( 0.7978822  ,  0.5,  1.0160785  ), ( 1.397108   ,  0.5,  0.6383058  ),\n",
+    "    ( 1.9312005  ,  0.5,  0.27355957 ), ( 2.3415399  ,  0.5, -0.11723983 ),\n",
+    "    ( 2.764906   ,  0.5, -0.58619916 ), ( 2.5108862  ,  0.5, -0.8532454  ),\n",
+    "    ( 2.1266003  ,  0.5, -1.1984516  ), ( 1.6836942  ,  0.5, -1.550171   ),\n",
+    "    ( 1.1496017  ,  0.5, -1.849784   ), ( 0.6089958  ,  0.5, -2.0777502  ),\n",
+    "    (-0.02279663 ,  0.5, -2.2536101  ), (-0.6220224  ,  0.5, -2.2666368  ),\n",
+    "    (-1.2407882  ,  0.5, -2.1819634  ), (-1.9051471  ,  0.5, -2.012617   ),\n",
+    "    (-2.5108862  ,  0.5, -1.7585975  ), (-3.227352   ,  0.5, -1.3156914  ),\n",
+    "    (-3.7614443  ,  0.5, -0.93791854 ), (-4.191324   ,  0.5, -0.5796858  ),\n",
+    "    (-4.197837   ,  0.5, -0.5666591  ), (-4.015464   ,  0.5, -0.16934645 ),\n",
+    "    (-3.6702578  ,  0.5,  0.44290602 ), (-3.292485   ,  0.5,  0.9379186  ),\n",
+    "    (-2.719313   ,  0.5,  1.4850378  ), (-2.0875204  ,  0.5,  1.8823506  ),\n",
+    "    (-1.3840814  ,  0.5,  2.1559103  ), (-0.6871557  ,  0.5,  2.2666366  ),\n",
+    "    ( 0.32892287 ,  0.5,  2.2210433  ), ( 1.2473016  ,  0.5,  2.032157   ),\n",
+    "    ( 2.1135736  ,  0.5,  1.810704   ), ( 3.4618316  ,  0.5,  1.2375314  ),\n",
+    "    ( 4.197837   ,  0.5,  0.7360056  ), ( 3.1622186  ,  0.5,  0.13677979 ),\n",
+    "    ( 2.5695062  ,  0.5,  0.62527907 ), ( 2.0875204  ,  0.5,  0.9900252  ),\n",
+    "    ( 1.559941   ,  0.5,  1.289638   ), ( 0.9607153  ,  0.5,  1.550171   ),\n",
+    "    ( 0.2963562  ,  0.5,  1.7455708  ), (-0.40708268 ,  0.5,  1.7976774  ),\n",
+    "    (-1.0128219  ,  0.5,  1.7455708  ), (-1.6381009  ,  0.5,  1.5110911  ),\n",
+    "    (-2.2112734  ,  0.5,  1.1268051  ), (-2.667206   ,  0.5,  0.644819   ),\n",
+    "    (-3.0449789  ,  0.5,  0.02605331 ), (-3.2729452  ,  0.5, -0.47547257 ),\n",
+    "    (-2.9016855  ,  0.5, -0.8923255  ), (-2.4783196  ,  0.5, -1.2179915  ),\n",
+    "    (-1.9181738  ,  0.5, -1.5436577  ), (-1.1886816  ,  0.5, -1.7585975  ),\n",
+    "    (-0.58945584 ,  0.5, -1.8237306  ), (-0.042336583,  0.5, -1.7585975  ),\n",
+    "    ( 0.452676   ,  0.5, -1.6022776  ), ( 0.934662   ,  0.5, -1.3612849  ),\n",
+    "    ( 1.436188   ,  0.5, -0.95094526 ), ( 1.8074474  ,  0.5, -0.5406059  ),\n",
+    "    ( 1.377568   ,  0.5, -0.039079905), ( 0.7718289  ,  0.5,  0.4754727  ),\n",
+    "    ( 0.15957642 ,  0.5,  0.8011388  ), (-0.61550903 ,  0.5,  0.8988387  ),\n",
+    "    (-1.2277615  ,  0.5,  0.6187657  ), (-1.559941   ,  0.5,  0.25401962 ),\n",
+    "    (-1.8335006  ,  0.5, -0.34520614 ), (-1.5208611  ,  0.5, -0.65133226 ),\n",
+    "    (-1.0584152  ,  0.5, -0.801139   ), (-0.51780915 ,  0.5, -0.7685722  ),\n",
+    "    (-0.10095644 ,  0.5, -0.44941938 ), ( 0.23773634 ,  0.5,  0.110726595),\n",
+    "    ( 1.0063086  ,  0.5, -0.5666591  ), ( 0.7392623  ,  0.5, -0.8076521  ),\n",
+    "    ( 0.37451613 ,  0.5, -1.0616719  ), (-0.009769917,  0.5, -1.2114781  ),\n",
+    "    (-0.53083587 ,  0.5, -1.2961515  ), (-1.0453885  ,  0.5, -1.2310182  ),\n",
+    "    (-1.5664544  ,  0.5, -1.0616719  ), (-2.0679803  ,  0.5, -0.84021866 ),\n",
+    ")\n",
+    "\n",
+    "GEOM_FACE_VERTEX_INDICES = (\n",
+    "    116, 117, 115, 117, 118, 115, 115, 118, 114, 118, 119, 114,\n",
+    "    119, 120, 114, 114, 120, 113, 120, 121, 113, 113, 121, 112,\n",
+    "    121, 122, 112, 151, 152, 150, 152, 153, 150, 153, 154, 150,\n",
+    "    154, 155, 150, 150, 155, 149, 155, 156, 149, 156, 157, 149,\n",
+    "    149, 157, 148, 157, 158, 148, 148, 158, 147, 158, 159, 147,\n",
+    "    147, 159, 146, 159,  80, 146,  80,  81, 146, 146,  81, 145,\n",
+    "     81,  82, 145, 145,  82, 144,  82,  83, 144, 144,  83, 143,\n",
+    "     83,  84, 143, 143,  84, 142,  84,  85, 142,  85,  86, 142,\n",
+    "    142,  86, 141,  86,  87, 141, 141,  87, 140,  87,  88, 140,\n",
+    "    140,  88, 139,  88,  89, 139, 112, 122, 111, 122, 123, 111,\n",
+    "    123, 124, 111, 111, 124, 110, 124, 125, 110, 110, 125, 109,\n",
+    "    125, 126, 109, 109, 126, 108, 126, 127, 108, 108, 127, 107,\n",
+    "    127, 128, 107, 107, 128, 106, 106, 128, 105, 128, 129, 105,\n",
+    "    105, 129, 104, 104, 129, 103, 103, 129, 102, 102, 129, 101,\n",
+    "    129, 130, 101, 101, 130, 100, 130, 131, 100, 131, 132, 100,\n",
+    "    100, 132,  99, 132, 133,  99,  99, 133,  98,  98, 133,  97,\n",
+    "    133, 134,  97,  97, 134,  96, 134, 135,  96,  96, 135,  95,\n",
+    "    135, 136,  95,  95, 136,  94, 136, 137,  94,  94, 137,  93,\n",
+    "    137, 138,  93,  93, 138,  92, 138, 139,  92,  92, 139,  91,\n",
+    "    139,  89,  91,  89,  90,  91,  13,  14,  12,  12,  14,  11,\n",
+    "     14,  15,  11,  11,  15,  10,  10,  15,   9,  15,  16,   9,\n",
+    "      9,  16,   8,  16,  17,   8,   8,  17,   7,  59,  60,  58,\n",
+    "     58,  60,  57,  57,  60,  56,  56,  60,  55,  60,  61,  55,\n",
+    "     55,  61,  54,  54,  61,  53,  61,  62,  53,  53,  62,  52,\n",
+    "     62,  63,  52,  52,  63,  51,  63,  64,  51,  51,  64,  50,\n",
+    "     50,  64,  65,  64,  79,  65,  65,  79,  66,  79,  78,  66,\n",
+    "     66,  78,  67,  78,  77,  67,  67,  77,  68,  77,  76,  68,\n",
+    "     68,  76,  69,  69,  76,  70,  76,  75,  70,  70,  75,  71,\n",
+    "     75,  74,  71,  71,  74,  72,  74,  40,  72,  72,  40,  73,\n",
+    "      7,  17,   6,  17,  18,   6,   6,  18,   5,  18,  19,   5,\n",
+    "      5,  19,   4,  19,  20,   4,   4,  20,   3,  20,  21,   3,\n",
+    "      3,  21,   2,  21,  22,   2,   2,  22,   1,  22,  23,   1,\n",
+    "     23,  24,   1,   1,  24,   0,  24,  25,   0,  25,  26,   0,\n",
+    "     26,  27,   0,  27,  28,   0,   0,  28,  49,  28,  29,  49,\n",
+    "     49,  29,  48,  48,  29,  47,  29,  30,  47,  47,  30,  46,\n",
+    "     30,  31,  46,  46,  31,  45,  31,  32,  45,  32,  33,  45,\n",
+    "     45,  33,  44,  33,  34,  44,  44,  34,  43,  34,  35,  43,\n",
+    "     43,  35,  42,  35,  36,  42,  42,  36,  41,  36,  37,  41,\n",
+    "     41,  37,  40,  37,  38,  40,  40,  38,  73,  38,  39,  73,\n",
+    "     50,  65,  80,  80,  65,  81,  65,  66,  81,  81,  66,  82,\n",
+    "     66,  67,  82,  82,  67,  83,  67,  68,  83,  83,  68,  84,\n",
+    "     68,  69,  84,  84,  69,  85,  69,  70,  85,  85,  70,  86,\n",
+    "     70,  71,  86,  86,  71,  87,  71,  72,  87,  87,  72,  88,\n",
+    "     72,  73,  88,  88,  73,  89,  73,  39,  89,  89,  39,  90,\n",
+    "     39,  38,  90,  90,  38,  91,  38,  37,  91,  91,  37,  92,\n",
+    "     37,  36,  92,  92,  36,  93,  36,  35,  93,  93,  35,  94,\n",
+    "     35,  34,  94,  94,  34,  95,  34,  33,  95,  95,  33,  96,\n",
+    "     33,  32,  96,  96,  32,  97,  32,  31,  97,  97,  31,  98,\n",
+    "     31,  30,  98,  98,  30,  99,  30,  29,  99,  99,  29, 100,\n",
+    "     29,  28, 100, 100,  28, 101,  28,  27, 101, 101,  27, 102,\n",
+    "     27,  26, 102, 102,  26, 103,  26,  25, 103, 103,  25, 104,\n",
+    "     25,  24, 104, 104,  24, 105,  24,  23, 105, 105,  23, 106,\n",
+    "     23,  22, 106, 106,  22, 107,  22,  21, 107, 107,  21, 108,\n",
+    "     21,  20, 108, 108,  20, 109,  20,  19, 109, 109,  19, 110,\n",
+    "     19,  18, 110, 110,  18, 111,  18,  17, 111, 111,  17, 112,\n",
+    "     17,  16, 112, 112,  16, 113,  16,  15, 113, 113,  15, 114,\n",
+    "     15,  14, 114, 114,  14, 115,  14,  13, 115, 115,  13, 116,\n",
+    "     13,  12, 116, 116,  12, 117,  12,  11, 117, 117,  11, 118,\n",
+    "     11,  10, 118, 118,  10, 119,  10,   9, 119, 119,   9, 120,\n",
+    "      9,   8, 120, 120,   8, 121,   8,   7, 121, 121,   7, 122,\n",
+    "      7,   6, 122, 122,   6, 123,   6,   5, 123, 123,   5, 124,\n",
+    "      5,   4, 124, 124,   4, 125,   4,   3, 125, 125,   3, 126,\n",
+    "      3,   2, 126, 126,   2, 127,   2,   1, 127, 127,   1, 128,\n",
+    "      1,   0, 128, 128,   0, 129,   0,  49, 129, 129,  49, 130,\n",
+    "     49,  48, 130, 130,  48, 131,  48,  47, 131, 131,  47, 132,\n",
+    "     47,  46, 132, 132,  46, 133,  46,  45, 133, 133,  45, 134,\n",
+    "     45,  44, 134, 134,  44, 135,  44,  43, 135, 135,  43, 136,\n",
+    "     43,  42, 136, 136,  42, 137,  42,  41, 137, 137,  41, 138,\n",
+    "     41,  40, 138, 138,  40, 139,  40,  74, 139, 139,  74, 140,\n",
+    "     74,  75, 140, 140,  75, 141,  75,  76, 141, 141,  76, 142,\n",
+    "     76,  77, 142, 142,  77, 143,  77,  78, 143, 143,  78, 144,\n",
+    "     78,  79, 144, 144,  79, 145,  79,  64, 145, 145,  64, 146,\n",
+    "     64,  63, 146, 146,  63, 147,  63,  62, 147, 147,  62, 148,\n",
+    "     62,  61, 148, 148,  61, 149,  61,  60, 149, 149,  60, 150,\n",
+    "     60,  59, 150, 150,  59, 151,  59,  58, 151, 151,  58, 152,\n",
+    "     58,  57, 152, 152,  57, 153,  57,  56, 153, 153,  56, 154,\n",
+    "     56,  55, 154, 154,  55, 155,  55,  54, 155, 155,  54, 156,\n",
+    "     54,  53, 156, 156,  53, 157,  53,  52, 157, 157,  52, 158,\n",
+    "     52,  51, 158, 158,  51, 159,  51,  50, 159, 159,  50,  80,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "\"\"\"Compute\"\"\"\n",
+    "\n",
+    "@wp.func\n",
+    "def bourke_color_map(\n",
+    "    low: float,\n",
+    "    high: float,\n",
+    "    v: float,\n",
+    ") -> wp.vec3:\n",
+    "    r = 1.0\n",
+    "    g = 1.0\n",
+    "    b = 1.0\n",
+    "\n",
+    "    if v < low:\n",
+    "        v = low\n",
+    "    if v > high:\n",
+    "        v = high\n",
+    "\n",
+    "    dv = high - low\n",
+    "\n",
+    "    if v < (low + 0.25 * dv):\n",
+    "        r = 0.0\n",
+    "        g = 4.0 * (v - low) / dv\n",
+    "    elif v < (low + 0.5 * dv):\n",
+    "        r = 0.0\n",
+    "        b = 1.0 + 4.0 * (low + 0.25 * dv - v) / dv\n",
+    "    elif v < (low + 0.75 * dv):\n",
+    "        r = 4.0 * (v - low - 0.5 * dv) / dv\n",
+    "        b = 0.0\n",
+    "    else:\n",
+    "        g = 1.0 + 4.0 * (low + 0.75 * dv - v) / dv\n",
+    "        b = 0.0\n",
+    "\n",
+    "    return wp.vec3(r, g, b)\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def compute_sdf_slice(\n",
+    "    pixel_size: float,\n",
+    "    pixel_offset: wp.vec2,\n",
+    "    geom: wp.uint64,\n",
+    "    sdf_band_width: float,\n",
+    "    out_texture: wp.array(dtype=wp.vec3, ndim=2),\n",
+    "):\n",
+    "    # Retrieve the current pixel's 2D indices.\n",
+    "    j, i = wp.tid()\n",
+    "\n",
+    "    # Compute the position in 3D space of the pixel.\n",
+    "    point = wp.vec3(\n",
+    "        float(j) * pixel_size + pixel_offset[0],\n",
+    "        0.0,\n",
+    "        float(i) * pixel_size + pixel_offset[1],\n",
+    "    )\n",
+    "\n",
+    "    # Query the nearest location on the mesh using BVH.\n",
+    "    query = wp.mesh_query_point(geom, point, max_dist=1e6)\n",
+    "    if not query.result:\n",
+    "        return\n",
+    "\n",
+    "    # Evaluate the position of the nearest location found.\n",
+    "    nearest_pos = wp.mesh_eval_position(\n",
+    "        geom,\n",
+    "        query.face,\n",
+    "        query.u,\n",
+    "        query.v,\n",
+    "    )\n",
+    "\n",
+    "    # Compute the SDF value.\n",
+    "    dist = wp.length(nearest_pos - point)\n",
+    "    if query.sign >= 0:\n",
+    "        sdf = dist\n",
+    "    else:\n",
+    "        sdf = -dist\n",
+    "\n",
+    "    color = bourke_color_map(\n",
+    "        -pixel_size * sdf_band_width * 0.5,\n",
+    "        pixel_size * sdf_band_width * 0.5,\n",
+    "        sdf,\n",
+    "    )\n",
+    "\n",
+    "    # Store the colour into our output array.\n",
+    "    out_texture[i, j] = color\n",
+    "\n",
+    "\n",
+    "\"\"\"Initialization\"\"\"\n",
+    "\n",
+    "# Size of the grid used to slice the mesh along the up +Y axis.\n",
+    "grid_size = (10.0, 5.0)\n",
+    "\n",
+    "# Size of each pixel of the rendered texture.\n",
+    "pixel_size = 0.025\n",
+    "\n",
+    "# Width of the SDF's narrow-band, in pixels.\n",
+    "sdf_band_width = 0.3 / pixel_size\n",
+    "\n",
+    "# Resolution of the rendered texture.\n",
+    "resolution = (\n",
+    "    int(wp.ceil(grid_size[0] / pixel_size)),\n",
+    "    int(wp.ceil(grid_size[1] / pixel_size)),\n",
+    ")\n",
+    "\n",
+    "# Adjusted size of the grid to ensure it's a multiple of the pixel size.\n",
+    "grid_size = (\n",
+    "    resolution[0] * pixel_size,\n",
+    "    resolution[1] * pixel_size,\n",
+    ")\n",
+    "\n",
+    "# Amount by which to offset each pixel to account for the grid being centered\n",
+    "# at the origin, and also for each query to be made from the pixel's center.\n",
+    "pixel_offset = (\n",
+    "    (pixel_size - grid_size[0]) * 0.5,\n",
+    "    (pixel_size - grid_size[1]) * 0.5,\n",
+    ")\n",
+    "\n",
+    "# Create a Warp mesh representing the geometry in order to perform BVH queries.\n",
+    "geom_mesh = wp.Mesh(\n",
+    "    points=wp.array(GEOM_POINTS, dtype=wp.vec3),\n",
+    "    indices=wp.array(GEOM_FACE_VERTEX_INDICES, dtype=int),\n",
+    ")\n",
+    "\n",
+    "# Allocate an array to store the rendered texture.\n",
+    "texture = wp.empty(shape=(resolution[1], resolution[0]), dtype=wp.vec3)\n",
+    "\n",
+    "\n",
+    "\"\"\"Evaluation\"\"\"\n",
+    "\n",
+    "wp.launch(\n",
+    "    compute_sdf_slice,\n",
+    "    dim=resolution,\n",
+    "    inputs=(\n",
+    "        pixel_size,\n",
+    "        pixel_offset,\n",
+    "        geom_mesh.id,\n",
+    "        sdf_band_width,\n",
+    "    ),\n",
+    "    outputs=(\n",
+    "        texture,\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "\n",
+    "\"\"\"Visualization in Matplotlib\"\"\"\n",
+    "\n",
+    "plot_fig = matplotlib.pyplot.figure(figsize=resolution, dpi=1.0)\n",
+    "plot_fig.subplots_adjust(left=0, bottom=0, right=1, top=1)\n",
+    "plot_img = matplotlib.pyplot.imshow(texture.numpy())\n",
+    "plot_img.axes.set_axis_off()\n",
+    "\n",
+    "IPython.display.display(plot_fig)\n",
+    "matplotlib.pyplot.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Another such built-in is `wp.mesh_query_ray()`, which allows finding intersections between a ray and a mesh geometry, as is required for raycasting-like applications."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Triangular mesh data for a geometry\"\"\"\n",
+    "\n",
+    "GEOM_POINTS = (\n",
+    "    ( 0.0     ,  0.0     , -0.5     ), ( 0.0     ,  0.28656 , -0.425325),\n",
+    "    ( 0.249999,  0.081230, -0.425325), ( 0.0     ,  0.447213, -0.223606),\n",
+    "    ( 0.249999,  0.344095, -0.262865), ( 0.425325,  0.138196, -0.223606),\n",
+    "    ( 0.154508, -0.212662, -0.425325), ( 0.404508, -0.131432, -0.262865),\n",
+    "    ( 0.262865, -0.361803, -0.223606), (-0.154508, -0.212662, -0.425325),\n",
+    "    ( 0.0     , -0.425325, -0.262865), (-0.262865, -0.361803, -0.223606),\n",
+    "    (-0.249999,  0.081230, -0.425325), (-0.404508, -0.131432, -0.262865),\n",
+    "    (-0.425325,  0.138196, -0.223606), (-0.249999,  0.344095, -0.262865),\n",
+    "    (-0.154508,  0.475528,  0.0     ), (-0.404508,  0.293892,  0.0     ),\n",
+    "    (-0.262865,  0.361803,  0.223606), (-0.5     ,  0.0     ,  0.0     ),\n",
+    "    (-0.404508, -0.293892,  0.0     ), (-0.425325, -0.138196,  0.223606),\n",
+    "    (-0.154508, -0.475528,  0.0     ), ( 0.154508, -0.475528,  0.0     ),\n",
+    "    ( 0.0     , -0.447213,  0.223606), ( 0.404508, -0.293892,  0.0     ),\n",
+    "    ( 0.5     ,  0.0     ,  0.0     ), ( 0.425325, -0.138196,  0.223606),\n",
+    "    ( 0.404508,  0.293892,  0.0     ), ( 0.154508,  0.475528,  0.0     ),\n",
+    "    ( 0.262865,  0.361803,  0.223606), ( 0.0     ,  0.425325,  0.262865),\n",
+    "    (-0.404508,  0.131432,  0.262865), (-0.249999, -0.344095,  0.262865),\n",
+    "    ( 0.249999, -0.344095,  0.262865), ( 0.404508,  0.131432,  0.262865),\n",
+    "    ( 0.0     ,  0.0     ,  0.5     ), ( 0.154508,  0.212662,  0.425325),\n",
+    "    (-0.154508,  0.212662,  0.425325), ( 0.249999, -0.081230,  0.425325),\n",
+    "    ( 0.0     , -0.262865,  0.425325), (-0.249999, -0.081230,  0.425325),\n",
+    ")\n",
+    "\n",
+    "GEOM_FACE_VERTEX_INDICES = (\n",
+    "     2,  0,  1,  4,  1,  3,  5,  2,  4,  2,  1,  4,  6,  0,  2,  7,\n",
+    "     2,  5,  8,  6,  7,  6,  2,  7,  9,  0,  6, 10,  6,  8, 11,  9,\n",
+    "    10,  9,  6, 10, 12,  0,  9, 13,  9, 11, 14, 12, 13, 12,  9, 13,\n",
+    "     1,  0, 12, 15, 12, 14,  3,  1, 15,  1, 12, 15, 16,  3, 15, 17,\n",
+    "    15, 14, 18, 16, 17, 16, 15, 17, 19, 14, 13, 20, 13, 11, 21, 19,\n",
+    "    20, 19, 13, 20, 22, 11, 10, 23, 10,  8, 24, 22, 23, 22, 10, 23,\n",
+    "    25,  8,  7, 26,  7,  5, 27, 25, 26, 25,  7, 26, 28,  5,  4, 29,\n",
+    "     4,  3, 30, 28, 29, 28,  4, 29, 29,  3, 16, 31, 16, 18, 30, 29,\n",
+    "    31, 29, 16, 31, 17, 14, 19, 32, 19, 21, 18, 17, 32, 17, 19, 32,\n",
+    "    20, 11, 22, 33, 22, 24, 21, 20, 33, 20, 22, 33, 23,  8, 25, 34,\n",
+    "    25, 27, 24, 23, 34, 23, 25, 34, 26,  5, 28, 35, 28, 30, 27, 26,\n",
+    "    35, 26, 28, 35, 38, 36, 37, 31, 37, 30, 18, 38, 31, 38, 37, 31,\n",
+    "    37, 36, 39, 35, 39, 27, 30, 37, 35, 37, 39, 35, 39, 36, 40, 34,\n",
+    "    40, 24, 27, 39, 34, 39, 40, 34, 40, 36, 41, 33, 41, 21, 24, 40,\n",
+    "    33, 40, 41, 33, 41, 36, 38, 32, 38, 18, 21, 41, 32, 41, 38, 32,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "\"\"\"Compute\"\"\"\n",
+    "\n",
+    "@wp.kernel\n",
+    "def draw_ray_mesh_intersection(\n",
+    "    pixel_size: float,\n",
+    "    pixel_offset: wp.vec2,\n",
+    "    geom: wp.uint64,\n",
+    "    out_texture: wp.array(dtype=wp.vec3, ndim=2),\n",
+    "):\n",
+    "    j, i = wp.tid()\n",
+    "\n",
+    "    ray_origin = wp.vec3(0.0, 0.0, 2.0)\n",
+    "    ray_dir = wp.normalize(\n",
+    "        wp.vec3(\n",
+    "            float(j) * pixel_size + pixel_offset[0],\n",
+    "            float(i) * pixel_size + pixel_offset[1],\n",
+    "            -1.0,\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "    color = wp.vec3(0.0, 0.0, 0.0)\n",
+    "\n",
+    "    query = wp.mesh_query_ray(geom, ray_origin, ray_dir, 1.0e6)\n",
+    "    if query.result:\n",
+    "        color = query.normal * 0.5 + wp.vec3(0.5, 0.5, 0.5)\n",
+    "\n",
+    "    out_texture[i, j] = color\n",
+    "\n",
+    "\n",
+    "\"\"\"Initialization\"\"\"\n",
+    "\n",
+    "# Size of the grid to cast rays from.\n",
+    "grid_size = (1.5, 1.5)\n",
+    "\n",
+    "# Size of each pixel of the rendered texture.\n",
+    "pixel_size = 0.005\n",
+    "\n",
+    "# Resolution of the rendered texture.\n",
+    "resolution = (\n",
+    "    int(wp.ceil(grid_size[0] / pixel_size)),\n",
+    "    int(wp.ceil(grid_size[1] / pixel_size)),\n",
+    ")\n",
+    "\n",
+    "# Adjusted size of the grid to ensure it's a multiple of the pixel size.\n",
+    "grid_size = (\n",
+    "    resolution[0] * pixel_size,\n",
+    "    resolution[1] * pixel_size,\n",
+    ")\n",
+    "\n",
+    "# Amount by which to offset each pixel to account for the grid being centered\n",
+    "# at the origin, and also for each query to be made from the pixel's center.\n",
+    "pixel_offset = (\n",
+    "    (pixel_size - grid_size[0]) * 0.5,\n",
+    "    (pixel_size - grid_size[1]) * 0.5,\n",
+    ")\n",
+    "\n",
+    "# Create a Warp mesh representing the geometry in order to perform BVH queries.\n",
+    "geom_mesh = wp.Mesh(\n",
+    "    points=wp.array(GEOM_POINTS, dtype=wp.vec3),\n",
+    "    indices=wp.array(GEOM_FACE_VERTEX_INDICES, dtype=int),\n",
+    ")\n",
+    "\n",
+    "# Allocate an array to store the rendered texture.\n",
+    "texture = wp.empty(shape=(resolution[1], resolution[0]), dtype=wp.vec3)\n",
+    "\n",
+    "\n",
+    "\"\"\"Evaluation\"\"\"\n",
+    "\n",
+    "wp.launch(\n",
+    "    draw_ray_mesh_intersection,\n",
+    "    dim=resolution,\n",
+    "    inputs=(\n",
+    "        pixel_size,\n",
+    "        pixel_offset,\n",
+    "        geom_mesh.id,\n",
+    "    ),\n",
+    "    outputs=(\n",
+    "        texture,\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "\n",
+    "\"\"\"Visualization in Matplotlib\"\"\"\n",
+    "\n",
+    "plot_fig = matplotlib.pyplot.figure(figsize=resolution, dpi=1.0)\n",
+    "plot_fig.subplots_adjust(left=0, bottom=0, right=1, top=1)\n",
+    "plot_img = matplotlib.pyplot.imshow(texture.numpy(), origin=\"lower\")\n",
+    "plot_img.axes.set_axis_off()\n",
+    "\n",
+    "IPython.display.display(plot_fig)\n",
+    "matplotlib.pyplot.close()"
+   ]
+  }
+ ],
+ "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/notebooks/core_05_volumes.ipynb b/notebooks/core_05_volumes.ipynb
new file mode 100644
index 00000000..a33759eb
--- /dev/null
+++ b/notebooks/core_05_volumes.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Warp Core Tutorial: Volumes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install warp-lang matplotlib pyglet usd-core"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warp as wp\n",
+    "\n",
+    "wp.config.quiet = True\n",
+    "\n",
+    "# Explicitly initializing Warp is not necessary but\n",
+    "# we do it here to ensure everything is good to go.\n",
+    "wp.init()\n",
+    "\n",
+    "# Everything else is solely to visualize the results.\n",
+    "import IPython\n",
+    "import matplotlib\n",
+    "import matplotlib.animation\n",
+    "import matplotlib.pyplot\n",
+    "import pyglet\n",
+    "\n",
+    "import warp.render\n",
+    "\n",
+    "matplotlib.pyplot.rc(\"animation\", html=\"jshtml\")\n",
+    "pyglet.options[\"headless\"] = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Surfacing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Compute\"\"\"\n",
+    "\n",
+    "@wp.func\n",
+    "def sdf_create_box(\n",
+    "    pos: wp.vec3,\n",
+    "    size: wp.vec3,\n",
+    "):\n",
+    "    \"\"\"Creates a SDF box primitive.\"\"\"\n",
+    "    # https://iquilezles.org/articles/distfunctions\n",
+    "    q = wp.vec3(\n",
+    "        wp.abs(pos[0]) - size[0],\n",
+    "        wp.abs(pos[1]) - size[1],\n",
+    "        wp.abs(pos[2]) - size[2],\n",
+    "    )\n",
+    "    qp = wp.vec3(wp.max(q[0], 0.0), wp.max(q[1], 0.0), wp.max(q[2], 0.0))\n",
+    "    return wp.length(qp) + wp.min(wp.max(q[0], wp.max(q[1], q[2])), 0.0)\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def sdf_create_torus(\n",
+    "    pos: wp.vec3,\n",
+    "    major_radius: float,\n",
+    "    minor_radius: float,\n",
+    "):\n",
+    "    \"\"\"Creates a SDF torus primitive.\"\"\"\n",
+    "    # https://iquilezles.org/articles/distfunctions\n",
+    "    q = wp.vec2(wp.length(wp.vec2(pos[0], pos[2])) - major_radius, pos[1])\n",
+    "    return wp.length(q) - minor_radius\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def sdf_translate(\n",
+    "    pos: wp.vec3,\n",
+    "    offset: wp.vec3,\n",
+    "):\n",
+    "    \"\"\"Translates a SDF position vector with an offset.\"\"\"\n",
+    "    return pos - offset\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def sdf_rotate(\n",
+    "    pos: wp.vec3,\n",
+    "    angles: wp.vec3,\n",
+    "):\n",
+    "    \"\"\"Rotates a SDF position vector using Euler angles.\"\"\"\n",
+    "    rot = wp.quat_rpy(\n",
+    "        wp.radians(angles[0]),\n",
+    "        wp.radians(angles[1]),\n",
+    "        wp.radians(angles[2]),\n",
+    "    )\n",
+    "    return wp.quat_rotate_inv(rot, pos)\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def sdf_smooth_min(\n",
+    "    a: float,\n",
+    "    b: float,\n",
+    "    radius: float,\n",
+    "):\n",
+    "    \"\"\"Computes the smooth minimum of two SDF values.\"\"\"\n",
+    "    # https://iquilezles.org/articles/smin\n",
+    "    h = wp.max(radius - wp.abs(a - b), 0.0) / radius\n",
+    "    return wp.min(a, b) - h * h * h * radius * (1.0 / 6.0)\n",
+    "\n",
+    "\n",
+    "@wp.kernel(enable_backward=False)\n",
+    "def make_field(\n",
+    "    torus_altitude: float,\n",
+    "    torus_major_radius: float,\n",
+    "    torus_minor_radius: float,\n",
+    "    smooth_min_radius: float,\n",
+    "    dim: int,\n",
+    "    time: float,\n",
+    "    out_data: wp.array3d(dtype=float),\n",
+    "):\n",
+    "    \"\"\"Kernel to generate a SDF volume based on primitives.\"\"\"\n",
+    "    i, j, k = wp.tid()\n",
+    "\n",
+    "    # Retrieve the position of the current cell in a normalized [-1, 1] range\n",
+    "    # for each dimension.\n",
+    "    pos = wp.vec3(\n",
+    "        2.0 * ((float(i) + 0.5) / float(dim)) - 1.0,\n",
+    "        2.0 * ((float(j) + 0.5) / float(dim)) - 1.0,\n",
+    "        2.0 * ((float(k) + 0.5) / float(dim)) - 1.0,\n",
+    "    )\n",
+    "\n",
+    "    box = sdf_create_box(\n",
+    "        sdf_translate(pos, wp.vec3(0.0, -0.7, 0.0)),\n",
+    "        wp.vec3(0.9, 0.3, 0.9),\n",
+    "    )\n",
+    "    torus = sdf_create_torus(\n",
+    "        sdf_rotate(\n",
+    "            sdf_translate(pos, wp.vec3(0.0, torus_altitude, 0.0)),\n",
+    "            wp.vec3(wp.sin(time) * 90.0, wp.cos(time) * 45.0, 0.0),\n",
+    "        ),\n",
+    "        torus_major_radius,\n",
+    "        torus_minor_radius,\n",
+    "    )\n",
+    "    out_data[i, j, k] = sdf_smooth_min(box, torus, smooth_min_radius)\n",
+    "\n",
+    "\n",
+    "\"\"\"Initialization\"\"\"\n",
+    "\n",
+    "# Resolution of the rendered image.\n",
+    "resolution = (512, 384)\n",
+    "\n",
+    "# Number of frames to run the sample for.\n",
+    "num_frames = 120\n",
+    "\n",
+    "# Number of frames per second.\n",
+    "fps = 60\n",
+    "\n",
+    "dim = 64\n",
+    "max_verts = int(1e6)\n",
+    "max_tris = int(1e6)\n",
+    "\n",
+    "torus_altitude = -0.5\n",
+    "torus_major_radius = 0.5\n",
+    "torus_minor_radius = 0.1\n",
+    "smooth_min_radius = 0.5\n",
+    "\n",
+    "field = wp.zeros((dim, dim, dim), dtype=float)\n",
+    "mc = wp.MarchingCubes(dim, dim, dim, max_verts, max_tris)\n",
+    "\n",
+    "# Camera settings.\n",
+    "camera_pos = (32.0, 32.0, 150.0)\n",
+    "camera_front = (0.0, -0.2, -1.0)\n",
+    "\n",
+    "# Create a headless OpenGL renderer for our scene.\n",
+    "renderer = warp.render.OpenGLRenderer(\n",
+    "    fps=fps,\n",
+    "    screen_width=resolution[0],\n",
+    "    screen_height=resolution[1],\n",
+    "    camera_pos=camera_pos,\n",
+    "    camera_front=camera_front,\n",
+    "    far_plane=200.0,\n",
+    "    draw_grid=False,\n",
+    "    draw_axis=False,\n",
+    "    vsync=True,\n",
+    "    headless=True,\n",
+    ")\n",
+    "\n",
+    "# Buffer storing the pixels data to visualize the resulting 3D render.\n",
+    "image = wp.empty(shape=(resolution[1], resolution[0], 3), dtype=float)\n",
+    "\n",
+    "\n",
+    "\"\"\"Evaluation\"\"\"\n",
+    "\n",
+    "renders = []\n",
+    "for frame in range(num_frames):\n",
+    "    wp.launch(\n",
+    "        make_field,\n",
+    "        dim=field.shape,\n",
+    "        inputs=(\n",
+    "            torus_altitude,\n",
+    "            torus_major_radius,\n",
+    "            torus_minor_radius,\n",
+    "            smooth_min_radius,\n",
+    "            dim,\n",
+    "            frame / fps,\n",
+    "        ),\n",
+    "        outputs=(field,),\n",
+    "    )\n",
+    "\n",
+    "    mc.surface(field, 0.0)\n",
+    "\n",
+    "    # Use the OpenGL renderer to store an image representing the 3D scene at\n",
+    "    # the current frame.\n",
+    "    renderer.begin_frame(frame / num_frames)\n",
+    "    renderer.render_mesh(\n",
+    "        \"surface\",\n",
+    "        mc.verts.numpy(),\n",
+    "        mc.indices.numpy(),\n",
+    "        colors=((0.35, 0.55, 0.9),) * len(mc.verts),\n",
+    "        update_topology=True,\n",
+    "    )\n",
+    "    renderer.end_frame()\n",
+    "\n",
+    "    # Store the resulting render on host memory.\n",
+    "    renderer.get_pixels(image, split_up_tiles=False, mode=\"rgb\")\n",
+    "    renders.append(wp.clone(image, device=\"cpu\", pinned=True))\n",
+    "\n",
+    "# Ensure that all the kernel launches and copies to CPU have finished.\n",
+    "wp.synchronize()\n",
+    "\n",
+    "\n",
+    "\"\"\"Visualization in Matplotlib\"\"\"\n",
+    "\n",
+    "# Set-up Matplotlib.\n",
+    "plot_fig = matplotlib.pyplot.figure(figsize=resolution, dpi=1.0)\n",
+    "plot_fig.subplots_adjust(left=0, bottom=0, right=1, top=1)\n",
+    "plot_img = matplotlib.pyplot.imshow(renders[0], animated=True)\n",
+    "plot_img.axes.set_axis_off()\n",
+    "\n",
+    "# Run Matplotlib's animation.\n",
+    "plot_anim = matplotlib.animation.FuncAnimation(\n",
+    "    plot_fig,\n",
+    "    lambda frame: plot_img.set_data(renders[frame]),\n",
+    "    frames=num_frames,\n",
+    "    interval=(1.0 / fps) * 1000.0,\n",
+    ")\n",
+    "\n",
+    "# Display the result.\n",
+    "IPython.display.display(plot_anim)\n",
+    "matplotlib.pyplot.close()"
+   ]
+  }
+ ],
+ "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/pyproject.toml b/pyproject.toml
index 231446d6..391d0df5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,7 @@ cache-dir = ".cache/ruff"
 line-length = 120
 indent-width = 4
 extend-exclude = [
+    "notebooks",
     "warp/native/cutlass/",
     "warp/thirdparty/appdirs.py",
     "warp/thirdparty/dlpack.py",

From b10d61f63510cccfb75e278a1bc76e5676271ed0 Mon Sep 17 00:00:00 2001
From: Christopher Crouzet <christopher@crouzet.pm>
Date: Wed, 6 Nov 2024 15:13:19 +1300
Subject: [PATCH 04/10] Support topology updates in the OpenGL renderer

---
 CHANGELOG.md                 |   1 +
 warp/render/render_opengl.py | 183 ++++++++++++++++++++++++-----------
 2 files changed, 126 insertions(+), 58 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f61ec69..61e21c3f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,7 @@
 - Fix `wp.printf()` erroring out when no variadic arguments are passed ([GH-333](https://github.com/NVIDIA/warp/issues/333)).
 - Fix memory access issues in soft-rigid contact collisions ([GH-362](https://github.com/NVIDIA/warp/issues/362)).
 - Fix gradient propagation for in-place addition/subtraction operations on custom vector-type arrays.
+- Fix topology updates not being supported by the the OpenGL renderer.
 
 ## [1.4.2] - 2024-11-13
 
diff --git a/warp/render/render_opengl.py b/warp/render/render_opengl.py
index c8dea5a1..274273c3 100644
--- a/warp/render/render_opengl.py
+++ b/warp/render/render_opengl.py
@@ -310,14 +310,15 @@ def update_vbo_transforms(
 @wp.kernel
 def update_vbo_vertices(
     points: wp.array(dtype=wp.vec3),
+    scale: wp.vec3,
     # outputs
     vbo_vertices: wp.array(dtype=float, ndim=2),
 ):
     tid = wp.tid()
     p = points[tid]
-    vbo_vertices[tid, 0] = p[0]
-    vbo_vertices[tid, 1] = p[1]
-    vbo_vertices[tid, 2] = p[2]
+    vbo_vertices[tid, 0] = p[0] * scale[0]
+    vbo_vertices[tid, 1] = p[1] * scale[1]
+    vbo_vertices[tid, 2] = p[2] * scale[2]
 
 
 @wp.kernel
@@ -375,13 +376,14 @@ def update_line_transforms(
 def compute_gfx_vertices(
     indices: wp.array(dtype=int, ndim=2),
     vertices: wp.array(dtype=wp.vec3, ndim=1),
+    scale: wp.vec3,
     # outputs
     gfx_vertices: wp.array(dtype=float, ndim=2),
 ):
     tid = wp.tid()
-    v0 = vertices[indices[tid, 0]]
-    v1 = vertices[indices[tid, 1]]
-    v2 = vertices[indices[tid, 2]]
+    v0 = wp.cw_mul(vertices[indices[tid, 0]], scale)
+    v1 = wp.cw_mul(vertices[indices[tid, 1]], scale)
+    v2 = wp.cw_mul(vertices[indices[tid, 2]], scale)
     i = tid * 3
     j = i + 1
     k = i + 2
@@ -410,6 +412,7 @@ def compute_gfx_vertices(
 def compute_average_normals(
     indices: wp.array(dtype=int, ndim=2),
     vertices: wp.array(dtype=wp.vec3),
+    scale: wp.vec3,
     # outputs
     normals: wp.array(dtype=wp.vec3),
     faces_per_vertex: wp.array(dtype=int),
@@ -418,9 +421,9 @@ def compute_average_normals(
     i = indices[tid, 0]
     j = indices[tid, 1]
     k = indices[tid, 2]
-    v0 = vertices[i]
-    v1 = vertices[j]
-    v2 = vertices[k]
+    v0 = wp.cw_mul(vertices[i], scale)
+    v1 = wp.cw_mul(vertices[j], scale)
+    v2 = wp.cw_mul(vertices[k], scale)
     n = wp.normalize(wp.cross(v1 - v0, v2 - v0))
     wp.atomic_add(normals, i, n)
     wp.atomic_add(faces_per_vertex, i, 1)
@@ -435,15 +438,16 @@ def assemble_gfx_vertices(
     vertices: wp.array(dtype=wp.vec3, ndim=1),
     normals: wp.array(dtype=wp.vec3),
     faces_per_vertex: wp.array(dtype=int),
+    scale: wp.vec3,
     # outputs
     gfx_vertices: wp.array(dtype=float, ndim=2),
 ):
     tid = wp.tid()
     v = vertices[tid]
     n = normals[tid] / float(faces_per_vertex[tid])
-    gfx_vertices[tid, 0] = v[0]
-    gfx_vertices[tid, 1] = v[1]
-    gfx_vertices[tid, 2] = v[2]
+    gfx_vertices[tid, 0] = v[0] * scale[0]
+    gfx_vertices[tid, 1] = v[1] * scale[1]
+    gfx_vertices[tid, 2] = v[2] * scale[2]
     gfx_vertices[tid, 3] = n[0]
     gfx_vertices[tid, 4] = n[1]
     gfx_vertices[tid, 5] = n[2]
@@ -2200,6 +2204,25 @@ def register_shape(self, geo_hash, vertices, indices, color1=None, color2=None):
 
         return shape
 
+    def deregister_shape(self, shape):
+        from pyglet import gl
+
+        if shape not in self._shape_gl_buffers:
+            return
+
+        vao, vbo, ebo, _, vertex_cuda_buffer = self._shape_gl_buffers[shape]
+        try:
+            gl.glDeleteVertexArrays(1, vao)
+            gl.glDeleteBuffers(1, vbo)
+            gl.glDeleteBuffers(1, ebo)
+        except gl.GLException:
+            pass
+
+        _, _, _, _, geo_hash = self._shapes[shape]
+        del self._shape_geo_hash[geo_hash]
+        del self._shape_gl_buffers[shape]
+        self._shapes.pop(shape)
+
     def add_shape_instance(
         self,
         name: str,
@@ -2227,6 +2250,19 @@ def add_shape_instance(
         self._instance_count = len(self._instances)
         return instance
 
+    def remove_shape_instance(self, name: str):
+        """Remove the shape instance registered under ``name``; do nothing if it is unknown."""
+        if name not in self._instances:
+            return
+
+        instance, _, shape, _, _, _, _, _ = self._instances[name]
+        self._shape_instances[shape].remove(instance)
+        del self._instance_shape[instance]
+        del self._instance_custom_ids[instance]
+        del self._instances[name]
+        self._instance_count = len(self._instances)  # recount only after the entry is gone
+        self._add_shape_instances = self._instance_count > 0
+
     def update_instance_colors(self):
         from pyglet import gl
 
@@ -2243,15 +2279,13 @@ def update_instance_colors(self):
         colors2 = np.array(colors2, dtype=np.float32)
 
         # create buffer for checkerboard colors
-        if self._instance_color1_buffer is None:
-            self._instance_color1_buffer = gl.GLuint()
-            gl.glGenBuffers(1, self._instance_color1_buffer)
+        self._instance_color1_buffer = gl.GLuint()
+        gl.glGenBuffers(1, self._instance_color1_buffer)
         gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._instance_color1_buffer)
         gl.glBufferData(gl.GL_ARRAY_BUFFER, colors1.nbytes, colors1.ctypes.data, gl.GL_STATIC_DRAW)
 
-        if self._instance_color2_buffer is None:
-            self._instance_color2_buffer = gl.GLuint()
-            gl.glGenBuffers(1, self._instance_color2_buffer)
+        self._instance_color2_buffer = gl.GLuint()
+        gl.glGenBuffers(1, self._instance_color2_buffer)
         gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._instance_color2_buffer)
         gl.glBufferData(gl.GL_ARRAY_BUFFER, colors2.nbytes, colors2.ctypes.data, gl.GL_STATIC_DRAW)
 
@@ -2891,52 +2925,85 @@ def render_mesh(
         """
         if colors is not None:
             colors = np.array(colors, dtype=np.float32)
-        points = np.array(points, dtype=np.float32) * np.array(scale, dtype=np.float32)
+
+        points = np.array(points, dtype=np.float32)
+        point_count = len(points)
+
         indices = np.array(indices, dtype=np.int32).reshape((-1, 3))
+        idx_count = len(indices)
+
+        geo_hash = hash((indices.tobytes(),))
+
         if name in self._instances:
-            self.update_shape_instance(name, pos, rot, color1=colors)
+            # We've already registered this mesh instance and its associated shape.
             shape = self._instances[name][2]
-            self.update_shape_vertices(shape, points)
-            return
-        geo_hash = hash((points.tobytes(), indices.tobytes()))
-        if geo_hash in self._shape_geo_hash:
-            shape = self._shape_geo_hash[geo_hash]
-            if self.update_shape_instance(name, pos, rot, color1=colors):
-                return shape
         else:
-            if smooth_shading:
-                normals = wp.zeros(len(points), dtype=wp.vec3)
-                vertices = wp.array(points, dtype=wp.vec3)
-                faces_per_vertex = wp.zeros(len(points), dtype=int)
-                wp.launch(
-                    compute_average_normals,
-                    dim=len(indices),
-                    inputs=[wp.array(indices, dtype=int), vertices],
-                    outputs=[normals, faces_per_vertex],
-                )
-                gfx_vertices = wp.zeros((len(points), 8), dtype=float)
-                wp.launch(
-                    assemble_gfx_vertices,
-                    dim=len(points),
-                    inputs=[vertices, normals, faces_per_vertex],
-                    outputs=[gfx_vertices],
-                )
-                gfx_vertices = gfx_vertices.numpy()
-                gfx_indices = indices.flatten()
+            if geo_hash in self._shape_geo_hash:
+                # We've only registered the shape, which can happen when `is_template` is `True`.
+                shape = self._shape_geo_hash[geo_hash]
             else:
-                gfx_vertices = wp.zeros((len(indices) * 3, 8), dtype=float)
-                wp.launch(
-                    compute_gfx_vertices,
-                    dim=len(indices),
-                    inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3)],
-                    outputs=[gfx_vertices],
-                )
-                gfx_vertices = gfx_vertices.numpy()
-                gfx_indices = np.arange(len(indices) * 3)
-            shape = self.register_shape(geo_hash, gfx_vertices, gfx_indices)
+                shape = None
+
+        # Check if we already have that shape registered and can perform
+        # minimal updates since the topology is not changing, before exiting.
+        if not update_topology:
+            if name in self._instances:
+                # Update the instance's transform.
+                self.update_shape_instance(name, pos, rot, color1=colors)
+
+            if shape is not None:
+                # Update the shape's point positions.
+                self.update_shape_vertices(shape, points, scale)
+                return shape
+
+        # No existing shape for the given mesh was found, or its topology may have changed,
+        # so we need to define a new one either way.
+        if smooth_shading:
+            normals = wp.zeros(point_count, dtype=wp.vec3)
+            vertices = wp.array(points, dtype=wp.vec3)
+            faces_per_vertex = wp.zeros(point_count, dtype=int)
+            wp.launch(
+                compute_average_normals,
+                dim=idx_count,
+                inputs=[wp.array(indices, dtype=int), vertices, scale],
+                outputs=[normals, faces_per_vertex],
+            )
+            gfx_vertices = wp.zeros((point_count, 8), dtype=float)
+            wp.launch(
+                assemble_gfx_vertices,
+                dim=point_count,
+                inputs=[vertices, normals, faces_per_vertex, scale],
+                outputs=[gfx_vertices],
+            )
+            gfx_vertices = gfx_vertices.numpy()
+            gfx_indices = indices.flatten()
+        else:
+            gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
+            wp.launch(
+                compute_gfx_vertices,
+                dim=idx_count,
+                inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3), scale],
+                outputs=[gfx_vertices],
+            )
+            gfx_vertices = gfx_vertices.numpy()
+            gfx_indices = np.arange(idx_count * 3)
+
+        # If there was a shape for the given mesh, clean it up.
+        if shape is not None:
+            self.deregister_shape(shape)
+
+        # If there was an instance for the given mesh, clean it up.
+        if name in self._instances:
+            self.remove_shape_instance(name)
+
+        # Register the new shape.
+        shape = self.register_shape(geo_hash, gfx_vertices, gfx_indices)
+
         if not is_template:
+            # Create a new instance if necessary.
             body = self._resolve_body_id(parent_body)
             self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
+
         return shape
 
     def render_arrow(
@@ -3101,7 +3168,7 @@ def render_line_strip(self, name: str, vertices, color: tuple = None, radius: fl
         lines = np.array(lines)
         self._render_lines(name, lines, color, radius)
 
-    def update_shape_vertices(self, shape, points):
+    def update_shape_vertices(self, shape, points, scale=(1.0, 1.0, 1.0)):
         if isinstance(points, wp.array):
             wp_points = points.to(self._device)
         else:
@@ -3114,7 +3181,7 @@ def update_shape_vertices(self, shape, points):
         wp.launch(
             update_vbo_vertices,
             dim=vertices_shape[0],
-            inputs=[wp_points],
+            inputs=[wp_points, scale],
             outputs=[vbo_vertices],
             device=self._device,
         )

From 6b216fe1b84178065168eee2892c5eae22c01de4 Mon Sep 17 00:00:00 2001
From: Zach Corse <zcorse@nvidia.com>
Date: Wed, 27 Nov 2024 06:53:06 -0800
Subject: [PATCH 05/10] Add custom vec to non-atomic in-place add test cases

---
 warp/tests/test_array.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/warp/tests/test_array.py b/warp/tests/test_array.py
index 61175a2c..7617c968 100644
--- a/warp/tests/test_array.py
+++ b/warp/tests/test_array.py
@@ -2580,6 +2580,9 @@ def inplace_add_non_atomic_types(x: wp.array(dtype=Any), y: wp.array(dtype=Any))
     x[i] += y[i]
 
 
+uint16vec3 = wp.vec(length=3, dtype=wp.uint16)
+
+
 def test_array_inplace_non_diff_ops(test, device):
     N = 3
     x1 = wp.full(N, value=10.0, dtype=float, device=device)
@@ -2593,7 +2596,7 @@ def test_array_inplace_non_diff_ops(test, device):
     wp.launch(inplace_div_1d, N, inputs=[x1, y1], device=device)
     assert_np_equal(x1.numpy(), np.full(N, fill_value=2.0, dtype=float))
 
-    for dtype in wp.types.non_atomic_types + (wp.vec2b, wp.vec2ub, wp.vec2s, wp.vec2us):
+    for dtype in wp.types.non_atomic_types + (wp.vec2b, wp.vec2ub, wp.vec2s, wp.vec2us, uint16vec3):
         x = wp.full(N, value=0, dtype=dtype, device=device)
         y = wp.full(N, value=1, dtype=dtype, device=device)
 

From 82348da25fe1595ee7155473b6ff4cbed9857427 Mon Sep 17 00:00:00 2001
From: Leopold Cambier <lcambier@nvidia.com>
Date: Wed, 27 Nov 2024 20:00:42 -0800
Subject: [PATCH 06/10] Warp + Tile: cuFFTDx filtering example

---
 warp/examples/tile/example_tile_filtering.py | 98 ++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 warp/examples/tile/example_tile_filtering.py

diff --git a/warp/examples/tile/example_tile_filtering.py b/warp/examples/tile/example_tile_filtering.py
new file mode 100644
index 00000000..a7fe0431
--- /dev/null
+++ b/warp/examples/tile/example_tile_filtering.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+###########################################################################
+# Example Tile Filtering
+#
+# Shows how to write a simple filtering kernel using Warp FFT tile
+# primitives.
+#
+###########################################################################
+
+import numpy as np
+
+import warp as wp
+
+wp.set_module_options({"enable_backward": False})
+
+BLOCK_DIM = 128
+TILE_M = 1
+TILE_N = 512
+
+scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+
+
+def cplx(array):
+    return array[..., 0] + 1j * array[..., 1]
+
+
+@wp.func
+def cplx_prod(x: wp.vec2d, y: wp.vec2d):
+    return wp.cw_mul(wp.vec2d(x[0] * y[0] - x[1] * y[1], x[0] * y[1] + x[1] * y[0]), scale)
+
+
+@wp.kernel
+def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d), z: wp.array2d(dtype=wp.vec2d)):
+    i, j, _ = wp.tid()
+    a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+    b = wp.tile_load(y, i, j, m=TILE_M, n=TILE_N)
+    wp.tile_fft(a)
+    c = wp.tile_map(cplx_prod, a, b)
+    wp.tile_ifft(c)
+    wp.tile_store(z, i, j, c)
+
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(42)
+
+    # Create noisy input signal
+    t = np.linspace(0, 2 * np.pi, TILE_N, dtype=np.float64)
+    x = np.sin(t) + 0.5 * rng.random(TILE_N, dtype=np.float64)
+
+    # Create filter. This filter keeps only ~10% of the frequencies at the center
+    # of the spectrum.
+    f = np.ones_like(x)
+    freq = np.fft.fftfreq(TILE_N)
+    f[np.abs(freq) > 0.05] = 0.0
+    f[np.abs(freq) <= 0.05] = 1.0
+
+    # Create Warp input data
+    # We use vec2d to hold complex numbers
+    x_h = np.zeros((TILE_M, TILE_N, 2), dtype=np.float64)
+    f_h = np.zeros_like(x_h)
+    y_h = np.zeros_like(f_h)
+
+    x_h[:, :, 0] = x
+    f_h[:, :, 0] = f
+
+    x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+    f_wp = wp.array2d(f_h, dtype=wp.vec2d)
+    y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+    wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp, f_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+    # Extract output and compare with numpy
+    x_np = cplx(x_h)
+    f_np = cplx(f_h)
+    y_test = cplx(y_wp.numpy())
+    y_ref = np.fft.ifft(f_np * np.fft.fft(x_np))
+    assert np.allclose(y_ref, y_test)
+
+try:
+    import matplotlib.pyplot as plt
+
+    plt.figure()
+    plt.plot(
+        x,
+        "r-",
+        label="Original",
+    )
+    plt.plot(y_test[0, :].real, "b.-", label="Smoothed")
+    plt.legend()
+    plt.savefig("filter.png")
+except ModuleNotFoundError:
+    print("Matplotlib not available; skipping figure")

From 3b9812dd9b63f2b36419344fa608fe954f182076 Mon Sep 17 00:00:00 2001
From: Miles Macklin <mmacklin@nvidia.com>
Date: Thu, 28 Nov 2024 19:05:02 -0800
Subject: [PATCH 07/10] Update tiles.rst

---
 docs/modules/tiles.rst | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst
index b622c20b..36fa5fff 100644
--- a/docs/modules/tiles.rst
+++ b/docs/modules/tiles.rst
@@ -3,9 +3,7 @@ Tiles
 
 .. warning:: Tile-based operations in Warp are under preview, APIs are subject to change.
 
-Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high-performance kernels that can leverage cooperative operations on modern GPUs.
-
-Warp 1.5.0 introduces tile extensions that expose a block-based programming to Warp kernels. 
+Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high-performance kernels that can leverage cooperative operations on modern GPUs. With Warp 1.5.0, developers now have access to new tile-based programming primitives in Warp kernels. Leveraging cuBLASDx and cuFFTDx, these new tools provide developers with efficient matrix multiplication and Fourier transforms for accelerated simulation and scientific computing. 
 
 Requirements
 ------------
@@ -35,7 +33,7 @@ In the following example, we launch a grid of threads where each block is respon
 
         # load a row from global memory
         t = wp.tile_load(array[i], i, TILE_SIZE)
-        s = wp.sum(t)
+        s = wp.tile_sum(t)
         ...
 
     wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a], block_dim=TILE_THREADS)
@@ -62,7 +60,7 @@ In Warp, tile objects are 2D arrays of data where the tile elements may be scala
 
         # load a 2d tile from global memory
         t = wp.tile_load(array, i, j, m=TILE_M, n=TILE_N)
-        s = wp.sum(t)
+        s = wp.tile_sum(t)
         ...
 
     wp.launch_tiled(compute, dim=[a.shape[0]/TILE_M, a.shape[1]/TILE_N], inputs=[a], block_dim=TILE_THREADS)
@@ -193,8 +191,8 @@ Maps/Reductions
 * :func:`warp.tile_min`
 * :func:`warp.tile_max`
 
-MathDx
-^^^^^^
+Linear Algebra
+^^^^^^^^^^^^^^
 
 * :func:`warp.tile_matmul`
 * :func:`warp.tile_transpose`

From dc488eb22d73dab101ec8991eaeffa40519eeda1 Mon Sep 17 00:00:00 2001
From: Miles Macklin <mmacklin@nvidia.com>
Date: Sun, 1 Dec 2024 14:10:48 -0800
Subject: [PATCH 08/10] Doc build fix after changing section titles

---
 docs/modules/tiles.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst
index 36fa5fff..61734e34 100644
--- a/docs/modules/tiles.rst
+++ b/docs/modules/tiles.rst
@@ -239,7 +239,7 @@ Please see the :ref:`differentiability` section for more details.
 Building with MathDx
 --------------------
 
-The tile operations described in `MathDx`_ require Warp to be built with the MathDx library.
+The tile operations described in `Linear Algebra`_ require Warp to be built with the MathDx library.
 Starting with Warp 1.5.0, PyPI distributions will come with out-of-the-box support for tile operations
 leveraging MathDx APIs.
 

From 2c3d619a97b2292caf891d194bacba40192287fc Mon Sep 17 00:00:00 2001
From: Miles Macklin <mmacklin@nvidia.com>
Date: Mon, 2 Dec 2024 13:29:35 +1300
Subject: [PATCH 09/10] Cosmetic changes to the tile filtering example

---
 warp/examples/tile/example_tile_filtering.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/warp/examples/tile/example_tile_filtering.py b/warp/examples/tile/example_tile_filtering.py
index a7fe0431..90968638 100644
--- a/warp/examples/tile/example_tile_filtering.py
+++ b/warp/examples/tile/example_tile_filtering.py
@@ -85,14 +85,22 @@ def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d), z:
-try:
-    import matplotlib.pyplot as plt
+    # Plot inside the main guard: `x` and `y_test` only exist when run as a script.
+    try:
+        import matplotlib.pyplot as plt
 
-    plt.figure()
-    plt.plot(
-        x,
-        "r-",
-        label="Original",
-    )
-    plt.plot(y_test[0, :].real, "b.-", label="Smoothed")
-    plt.legend()
-    plt.savefig("filter.png")
-except ModuleNotFoundError:
-    print("Matplotlib not available; skipping figure")
+        fig, ax = plt.subplots(figsize=(10, 5))
+
+        ax.plot(
+            x,
+            color="#DDDDDD",
+            linewidth=2,
+            label="Original",
+        )
+        ax.plot(y_test[0, :].real, color="#76B900", linewidth=3, label="Smoothed")
+
+        ax.legend()
+        ax.grid(True)
+
+        plt.tight_layout()
+        plt.show()
+
+    except ModuleNotFoundError:
+        print("Matplotlib not available; skipping figure")

From 99d147eaa30882a933660f4e48b44ea860010f02 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 2 Dec 2024 09:17:10 -0800
Subject: [PATCH 10/10] Warp 1.5.0 Stable

---
 CHANGELOG.md                              | 21 +++++++++++----------
 README.md                                 |  6 +++---
 VERSION.md                                |  2 +-
 exts/omni.warp.core/config/extension.toml |  2 +-
 exts/omni.warp.core/docs/CHANGELOG.md     | 20 +++++++++++---------
 exts/omni.warp/config/extension.toml      |  4 ++--
 exts/omni.warp/docs/CHANGELOG.md          | 20 +++++++++++---------
 warp/config.py                            |  2 +-
 8 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61e21c3f..aac226aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@
   [documentation](https://nvidia.github.io/warp/modules/tiles.html) for details.
 - Expose a `reversed()` built-in for iterators ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
 - Support for saving Volumes into `.nvdb` files with the `save_to_nvdb` method.
-- warp.fem: Add `Trimesh3D` and `Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
+- warp.fem: Add `wp.fem.Trimesh3D` and `wp.fem.Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
 - warp.fem: Add `"add"` option to `wp.fem.integrate()` for accumulating integration result to existing output.
 - warp.fem: Add `"assembly"` option to `wp.fem.integrate()` for selecting between more memory-efficient or more
   computationally efficient integration algorithms.
@@ -22,19 +22,19 @@
   hard-coded value of 0.01 ([GH-329](https://github.com/NVIDIA/warp/issues/329)).
 - Add a `particle_radius` parameter to `wp.sim.ModelBuilder.add_cloth_mesh()` and `wp.sim.ModelBuilder.add_cloth_grid()`
   to set a uniform radius for the added particles.
-- Document `array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
+- Document `wp.array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
 - Document time-to-compile tradeoffs when using vector component assignment statements in kernels.
-- Add introductory Jupyter notebooks.
+- Add introductory Jupyter notebooks to the `notebooks` directory.
 
 ### Changed
 
 - Drop support for Python 3.7; Python 3.8 is now the minimum-supported version.
 - Promote the `wp.Int`, `wp.Float`, and `wp.Scalar` generic annotation types to the public API.
 - warp.fem: Simplify querying neighboring cell quantities when integrating on sides using new
-  `warp.fem.cells()`, `warp.fem.to_inner_cell()`, `warp.fem.to_outer_cell()` operators.
+  `wp.fem.cells()`, `wp.fem.to_inner_cell()`, `wp.fem.to_outer_cell()` operators.
 - Show an error message when the type returned by a function differs from its annotation, which would have led to the compilation stage failing.
-- Clarify that `randn()` samples a normal distribution of mean 0 and variance 1.
-- Raise error when passing more than 32 variadic argument to the `wp.printf` built-in.
+- Clarify that `wp.randn()` samples a normal distribution of mean 0 and variance 1.
+- Raise error when passing more than 32 variadic arguments to the `wp.printf()` built-in.
 
 ### Fixed
 
@@ -42,14 +42,14 @@
 - warp.fem: Fix tri-cubic shape functions on quadrilateral meshes.
 - warp.fem: Fix caching of integrand kernels when changing code-generation options.
 - Fix `wp.expect_neq()` overloads missing for scalar types.
-- Fix the OpenGL renderer's window not closing when clicking the X button.
-- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
-- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
 - Fix an error when a `wp.kernel` or a `wp.func` object is annotated to return a `None` value.
 - Fix error when reading multi-volume, BLOSC-compressed `.nvdb` files.
 - Fix `wp.printf()` erroring out when no variadic arguments are passed ([GH-333](https://github.com/NVIDIA/warp/issues/333)).
 - Fix memory access issues in soft-rigid contact collisions ([GH-362](https://github.com/NVIDIA/warp/issues/362)).
 - Fix gradient propagation for in-place addition/subtraction operations on custom vector-type arrays.
+- Fix the OpenGL renderer's window not closing when clicking the X button.
+- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
+- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
-- Fix topology updates not being supported by the the OpenGL renderer.
+- Fix topology updates not being supported by the OpenGL renderer.
 
 ## [1.4.2] - 2024-11-13
@@ -1220,7 +1220,8 @@
 
 - Initial publish for alpha testing
 
-[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.2...HEAD
+[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.5.0...HEAD
+[1.5.0]: https://github.com/NVIDIA/warp/releases/tag/v1.5.0
 [1.4.2]: https://github.com/NVIDIA/warp/releases/tag/v1.4.2
 [1.4.1]: https://github.com/NVIDIA/warp/releases/tag/v1.4.1
 [1.4.0]: https://github.com/NVIDIA/warp/releases/tag/v1.4.0
diff --git a/README.md b/README.md
index c5aa77ef..71aa2baf 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,9 @@ the `pip install` command, e.g.
 
 | Platform        | Install Command                                                                                                               |
 | --------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.2/warp_lang-1.4.2+cu11-py3-none-manylinux2014_aarch64.whl` |
-| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.2/warp_lang-1.4.2+cu11-py3-none-manylinux2014_x86_64.whl`  |
-| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.2/warp_lang-1.4.2+cu11-py3-none-win_amd64.whl`             |
+| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.5.0/warp_lang-1.5.0+cu11-py3-none-manylinux2014_aarch64.whl` |
+| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.5.0/warp_lang-1.5.0+cu11-py3-none-manylinux2014_x86_64.whl`  |
+| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.5.0/warp_lang-1.5.0+cu11-py3-none-win_amd64.whl`             |
 
 The `--force-reinstall` option may need to be used to overwrite a previous installation.
 
diff --git a/VERSION.md b/VERSION.md
index f984a0ec..bc80560f 100644
--- a/VERSION.md
+++ b/VERSION.md
@@ -1 +1 @@
-1.5.0-rc.2
+1.5.0
diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml
index 625adcc5..85655c40 100644
--- a/exts/omni.warp.core/config/extension.toml
+++ b/exts/omni.warp.core/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.5.0-rc.2"
+version = "1.5.0"
 authors = ["NVIDIA"]
 title = "Warp Core"
 description="The core Warp Python module"
diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md
index 7f45b1fa..115f1771 100644
--- a/exts/omni.warp.core/docs/CHANGELOG.md
+++ b/exts/omni.warp.core/docs/CHANGELOG.md
@@ -1,6 +1,6 @@
 # CHANGELOG
 
-## [1.5.0-rc.2] - 2024-12-02
+## [1.5.0] - 2024-12-02
 
 ### Added
 
@@ -8,7 +8,7 @@
   [documentation](https://nvidia.github.io/warp/modules/tiles.html) for details.
 - Expose a `reversed()` built-in for iterators ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
 - Support for saving Volumes into `.nvdb` files with the `save_to_nvdb` method.
-- warp.fem: Add `Trimesh3D` and `Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
+- warp.fem: Add `wp.fem.Trimesh3D` and `wp.fem.Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
 - warp.fem: Add `"add"` option to `wp.fem.integrate()` for accumulating integration result to existing output.
 - warp.fem: Add `"assembly"` option to `wp.fem.integrate()` for selecting between more memory-efficient or more
   computationally efficient integration algorithms.
@@ -22,18 +22,19 @@
   hard-coded value of 0.01 ([GH-329](https://github.com/NVIDIA/warp/issues/329)).
 - Add a `particle_radius` parameter to `wp.sim.ModelBuilder.add_cloth_mesh()` and `wp.sim.ModelBuilder.add_cloth_grid()`
   to set a uniform radius for the added particles.
-- Document `array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
+- Document `wp.array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
 - Document time-to-compile tradeoffs when using vector component assignment statements in kernels.
+- Add introductory Jupyter notebooks to the `notebooks` directory.
 
 ### Changed
 
 - Drop support for Python 3.7; Python 3.8 is now the minimum-supported version.
 - Promote the `wp.Int`, `wp.Float`, and `wp.Scalar` generic annotation types to the public API.
 - warp.fem: Simplify querying neighboring cell quantities when integrating on sides using new
-  `warp.fem.cells()`, `warp.fem.to_inner_cell()`, `warp.fem.to_outer_cell()` operators.
+  `wp.fem.cells()`, `wp.fem.to_inner_cell()`, `wp.fem.to_outer_cell()` operators.
 - Show an error message when the type returned by a function differs from its annotation, which would have led to the compilation stage failing.
-- Clarify that `randn()` samples a normal distribution of mean 0 and variance 1.
-- Raise error when passing more than 32 variadic argument to the `wp.printf` built-in.
+- Clarify that `wp.randn()` samples a normal distribution of mean 0 and variance 1.
+- Raise error when passing more than 32 variadic arguments to the `wp.printf()` built-in.
 
 ### Fixed
 
@@ -41,14 +42,15 @@
 - warp.fem: Fix tri-cubic shape functions on quadrilateral meshes.
 - warp.fem: Fix caching of integrand kernels when changing code-generation options.
 - Fix `wp.expect_neq()` overloads missing for scalar types.
-- Fix the OpenGL renderer's window not closing when clicking the X button.
-- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
-- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
 - Fix an error when a `wp.kernel` or a `wp.func` object is annotated to return a `None` value.
 - Fix error when reading multi-volume, BLOSC-compressed `.nvdb` files.
 - Fix `wp.printf()` erroring out when no variadic arguments are passed ([GH-333](https://github.com/NVIDIA/warp/issues/333)).
 - Fix memory access issues in soft-rigid contact collisions ([GH-362](https://github.com/NVIDIA/warp/issues/362)).
 - Fix gradient propagation for in-place addition/subtraction operations on custom vector-type arrays.
+- Fix the OpenGL renderer's window not closing when clicking the X button.
+- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
+- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
+- Fix topology updates not being supported by the OpenGL renderer.
 
 ## [1.4.2] - 2024-11-13
 
diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml
index 525fde25..5406cb0c 100644
--- a/exts/omni.warp/config/extension.toml
+++ b/exts/omni.warp/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.5.0-rc.2"
+version = "1.5.0"
 authors = ["NVIDIA"]
 title = "Warp"
 description="Warp OmniGraph Nodes and Sample Scenes"
@@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"]
 "omni.timeline" = {}
 "omni.ui" = {optional = true}
 "omni.usd" = {}
-"omni.warp.core" = {version = "1.5.0-rc.2", exact = true}
+"omni.warp.core" = {version = "1.5.0", exact = true}
 
 [[python.module]]
 name = "omni.warp._extension"
diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md
index 7f45b1fa..115f1771 100644
--- a/exts/omni.warp/docs/CHANGELOG.md
+++ b/exts/omni.warp/docs/CHANGELOG.md
@@ -1,6 +1,6 @@
 # CHANGELOG
 
-## [1.5.0-rc.2] - 2024-12-02
+## [1.5.0] - 2024-12-02
 
 ### Added
 
@@ -8,7 +8,7 @@
   [documentation](https://nvidia.github.io/warp/modules/tiles.html) for details.
 - Expose a `reversed()` built-in for iterators ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
 - Support for saving Volumes into `.nvdb` files with the `save_to_nvdb` method.
-- warp.fem: Add `Trimesh3D` and `Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
+- warp.fem: Add `wp.fem.Trimesh3D` and `wp.fem.Quadmesh3D` geometry types for 3D surfaces with new `example_distortion_energy` example.
 - warp.fem: Add `"add"` option to `wp.fem.integrate()` for accumulating integration result to existing output.
 - warp.fem: Add `"assembly"` option to `wp.fem.integrate()` for selecting between more memory-efficient or more
   computationally efficient integration algorithms.
@@ -22,18 +22,19 @@
   hard-coded value of 0.01 ([GH-329](https://github.com/NVIDIA/warp/issues/329)).
 - Add a `particle_radius` parameter to `wp.sim.ModelBuilder.add_cloth_mesh()` and `wp.sim.ModelBuilder.add_cloth_grid()`
   to set a uniform radius for the added particles.
-- Document `array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
+- Document `wp.array` attributes ([GH-364](https://github.com/NVIDIA/warp/issues/364)).
 - Document time-to-compile tradeoffs when using vector component assignment statements in kernels.
+- Add introductory Jupyter notebooks to the `notebooks` directory.
 
 ### Changed
 
 - Drop support for Python 3.7; Python 3.8 is now the minimum-supported version.
 - Promote the `wp.Int`, `wp.Float`, and `wp.Scalar` generic annotation types to the public API.
 - warp.fem: Simplify querying neighboring cell quantities when integrating on sides using new
-  `warp.fem.cells()`, `warp.fem.to_inner_cell()`, `warp.fem.to_outer_cell()` operators.
+  `wp.fem.cells()`, `wp.fem.to_inner_cell()`, `wp.fem.to_outer_cell()` operators.
 - Show an error message when the type returned by a function differs from its annotation, which would have led to the compilation stage failing.
-- Clarify that `randn()` samples a normal distribution of mean 0 and variance 1.
-- Raise error when passing more than 32 variadic argument to the `wp.printf` built-in.
+- Clarify that `wp.randn()` samples a normal distribution of mean 0 and variance 1.
+- Raise error when passing more than 32 variadic arguments to the `wp.printf()` built-in.
 
 ### Fixed
 
@@ -41,14 +42,15 @@
 - warp.fem: Fix tri-cubic shape functions on quadrilateral meshes.
 - warp.fem: Fix caching of integrand kernels when changing code-generation options.
 - Fix `wp.expect_neq()` overloads missing for scalar types.
-- Fix the OpenGL renderer's window not closing when clicking the X button.
-- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
-- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
 - Fix an error when a `wp.kernel` or a `wp.func` object is annotated to return a `None` value.
 - Fix error when reading multi-volume, BLOSC-compressed `.nvdb` files.
 - Fix `wp.printf()` erroring out when no variadic arguments are passed ([GH-333](https://github.com/NVIDIA/warp/issues/333)).
 - Fix memory access issues in soft-rigid contact collisions ([GH-362](https://github.com/NVIDIA/warp/issues/362)).
 - Fix gradient propagation for in-place addition/subtraction operations on custom vector-type arrays.
+- Fix the OpenGL renderer's window not closing when clicking the X button.
+- Fix the OpenGL renderer's camera snapping to a different direction from the initial camera's orientation when first looking around.
+- Fix custom colors being ignored when rendering meshes in OpenGL ([GH-343](https://github.com/NVIDIA/warp/issues/343)).
+- Fix topology updates not being supported by the OpenGL renderer.
 
 ## [1.4.2] - 2024-11-13
 
diff --git a/warp/config.py b/warp/config.py
index f487c03f..4907238a 100644
--- a/warp/config.py
+++ b/warp/config.py
@@ -7,7 +7,7 @@
 
 from typing import Optional
 
-version: str = "1.5.0-rc.2"
+version: str = "1.5.0"
 """Warp version string"""
 
 verify_fp: bool = False